[AMDGPU] Add a new amdgcn.load.to.lds intrinsic #137425
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-mlir-llvm

Author: Krzysztof Drewniak (krzysz00)

Changes

This PR adds an amdgcn.load.to.lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation.

This commit adds the intrinsic and its lowerings for address spaces 1 and 7, and updates the MLIR wrappers to use it (loosening the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang. (Any clang folks know why things are broken?)

Patch is 50.24 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/137425.diff

21 Files Affected:
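Before the diff, a minimal IR sketch of the intended usage, assembled from the declarations and codegen tests in the patch below (the function name and the size/offset/aux constants are illustrative, not taken from the patch). The intrinsic is overloaded only on the source pointer; the LDS base must be wave-uniform, and the size, offset, and aux operands are immediates.

; Sketch only: overloads for global (p1) and buffer-fat-pointer (p7) sources.
declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture, i32, i32, i32)
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture, ptr addrspace(3) nocapture, i32, i32, i32)

define amdgpu_ps void @load_to_lds_example(ptr addrspace(1) %gptr, ptr addrspace(7) inreg %bptr, ptr addrspace(3) inreg %lptr) {
  ; Load 4 bytes per lane from a global pointer into LDS (size 4, imm offset 0, aux 0).
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0)
  ; The same operation from a buffer fat pointer (address space 7).
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %bptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0)
  ret void
}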
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39fef9e4601f8..730fd15913c11 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index ad012d98635ff..a32ef1c2a5a12 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -564,6 +564,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
+ return emitBuiltinWithOneOverloadedType<5>(*this, E,
+ Intrinsic::amdgcn_load_to_lds);
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a6366aceec2a6..e6414a623b929 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 8251d6c213e3d..4b73347ac8155 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
}
+// CHECK-LABEL: @test_load_to_lds_96(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_96(global void* src, local void *dst) {
+  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_128(global void* src, local void *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
+}
+
// CHECK-LABEL: @test_global_load_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
new file mode 100644
index 0000000000000..6cdedb33bdd80
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_load_to_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u32(global u32* src, local u32 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u16(global u16* src, local u16 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u8(global u8* src, local u8 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6fb206e4df188..d86fc74fe2889 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -102,6 +102,14 @@ Changes to the AMDGPU Backend
* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
+* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
+intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
+(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
+represented in the IR without needing to use buffer resource intrinsics directly.
+This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
+buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
+optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a57eb4a6dba49..3c9886a01d757 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2641,6 +2641,27 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+/// This is a general-purpose intrinsic for all operations that take a pointer,
+/// a base location in LDS, and a data size and use them to perform a gather to LDS.
+/// This allows abstracting over both global pointers (address space 1) and
+/// the buffer-resource-wrapper pointers (address space 7 and 9).
+/// TODO: add support for address space 5 and scratch_load_lds.
+class AMDGPULoadToLDS :
+ ClangBuiltin<"__builtin_amdgcn_load_to_lds">,
+ Intrinsic <
+ [],
+ [llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
+ LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
+ llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
+ llvm_i32_ty, // imm offset (applied to both input and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+ // bit 1 = sc1,
+ // bit 4 = scc))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
+def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
+
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2fa03e3964207..907b5b7e705d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
return selectBufferLoadLds(I);
+ // Until we can store both the address space of the global and the LDS
+ // arguments by having two MachineMemOperands on an intrinsic, we just trust
+ // that the argument is a global pointer (buffer pointers have been handled by
+ // an LLVM IR-level lowering).
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
return selectGlobalLoadLds(I);
case Intrinsic::amdgcn_exp_compr:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 7163ad2aa7dca..f86aafdf08f9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_load_to_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_load_to_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1d0e81db5a5db..6085c8d584af2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 6); // soffset
return;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5273,6 +5274,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c05ba42d999e9..c686bb00bc286 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1450,6 +1450,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
@@ -1531,6 +1532,7 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
Ptr = II->getArgOperand(0);
break;
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
Ptr = II->getArgOperand(1);
break;
@@ -10219,6 +10221,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(Load, 0);
}
+ // Buffers are handled by LowerBufferFatPointers, and we're going to go
+ // for "trust me" that the remaining cases are global pointers until
+ // such time as we can put two mem operands on an intrinsic.
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
@@ -10249,7 +10255,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
break;
}
- auto *M = cast<MemSDNode>(Op);
SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SmallVector<SDValue, 6> Ops;
@@ -10289,6 +10294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
+ auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = Op->getConstantOperandVal(5);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
new file mode 100644
index 0000000000000..72ef6963c9976
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds
+
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds),
+
+;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to
+;; the correct intrinsic.
+
+declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx3_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
+; GFX950-NEXT: s_endpgm
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX950-NEXT: s_mov_b32 m0, s5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
+; GFX950-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+ ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx4_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
+; GFX950-NEXT: s_endpgm
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr...
[truncated]
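For the address-space-7 overload, the AMDGPULowerBufferFatPointers change above splits the fat pointer into its resource and offset parts and rewrites the call onto the existing raw-pointer buffer intrinsic. Roughly, as a hand-written before/after sketch (value names are illustrative; %rsrc and %voff stand for the resource and 32-bit offset components the pass extracts):

; before lowering
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %fat, ptr addrspace(3) %lds, i32 12, i32 16, i32 1)
; after lowering: soffset is hard-wired to 0, the imm offset and aux bits pass through
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voff, i32 0, i32 16, i32 1)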
|
@llvm/pr-subscribers-backend-amdgpu Author: Krzysztof Drewniak (krzysz00) ChangesThis PR adds a amdgns_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the saem API and "gather from a pointer to LDS" is something of an abstract operation. This commet adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang. (Any clang folks know why things are broken?) Patch is 50.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137425.diff 21 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39fef9e4601f8..730fd15913c11 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index ad012d98635ff..a32ef1c2a5a12 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -564,6 +564,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
+ return emitBuiltinWithOneOverloadedType<5>(*this, E,
+ Intrinsic::amdgcn_load_to_lds);
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a6366aceec2a6..e6414a623b929 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 8251d6c213e3d..4b73347ac8155 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
}
+// CHECK-LABEL: @test_load_to_lds_96(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_96(global void* src, local void *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_128(global void* src, local void *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
+}
+
// CHECK-LABEL: @test_global_load_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
new file mode 100644
index 0000000000000..6cdedb33bdd80
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_load_to_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u32(global u32* src, local u32 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u16(global u16* src, local u16 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u8(global u8* src, local u8 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6fb206e4df188..d86fc74fe2889 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -102,6 +102,14 @@ Changes to the AMDGPU Backend
* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
+* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
+intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
+(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
+represented in the IR without needing to use buffer resource intrinsics directly.
+This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
+buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
+optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a57eb4a6dba49..3c9886a01d757 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2641,6 +2641,27 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+/// This is a general-purpose intrinsic for all operations that take a pointer
+/// a base location in LDS, and a data size and use it to perform a gather to LDS.
+/// This allows abstracting over both global pointers (address space 1) and
+/// the buffer-resource-wrapper pointers (address space 7 and 9).
+/// TODO: add support for address space 5 and scratch_load_lds.
+class AMDGPULoadToLDS :
+ ClangBuiltin<"__builtin_amdgcn_load_to_lds">,
+ Intrinsic <
+ [],
+ [llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
+ LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
+ llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
+ llvm_i32_ty, // imm offset (applied to both input and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+ // bit 1 = sc1,
+ // bit 4 = scc))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
+def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
+
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2fa03e3964207..907b5b7e705d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
return selectBufferLoadLds(I);
+ // Until we can store both the address space of the global and the LDS
+ // arguments by having tto MachineMemOperands on an intrinsic, we just trust
+ // that the argument is a global pointer (buffer pointers have been handled by
+ // a LLVM IR-level lowering).
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
return selectGlobalLoadLds(I);
case Intrinsic::amdgcn_exp_compr:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 7163ad2aa7dca..f86aafdf08f9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_load_to_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_load_to_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1d0e81db5a5db..6085c8d584af2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 6); // soffset
return;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5273,6 +5274,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c05ba42d999e9..c686bb00bc286 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1450,6 +1450,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
@@ -1531,6 +1532,7 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
Ptr = II->getArgOperand(0);
break;
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
Ptr = II->getArgOperand(1);
break;
@@ -10219,6 +10221,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(Load, 0);
}
+ // Buffers are handled by LowerBufferFatPointers, and we're going to go
+ // for "trust me" that the remaining cases are global pointers until
+ // such time as we can put two mem operands on an intrinsic.
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
@@ -10249,7 +10255,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
break;
}
- auto *M = cast<MemSDNode>(Op);
SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SmallVector<SDValue, 6> Ops;
@@ -10289,6 +10294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
+ auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = Op->getConstantOperandVal(5);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
new file mode 100644
index 0000000000000..72ef6963c9976
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds
+
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds),
+
+;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to
+;; the correct intrinsic.
+
+declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------y
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx3_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
+; GFX950-NEXT: s_endpgm
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX950-NEXT: s_mov_b32 m0, s5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
+; GFX950-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+ ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx4_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
+; GFX950-NEXT: s_endpgm
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr...
[truncated]
|
@llvm/pr-subscribers-mlir Author: Krzysztof Drewniak (krzysz00) ChangesThis PR adds a amdgns_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the saem API and "gather from a pointer to LDS" is something of an abstract operation. This commet adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang. (Any clang folks know why things are broken?) Patch is 50.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137425.diff 21 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39fef9e4601f8..730fd15913c11 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index ad012d98635ff..a32ef1c2a5a12 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -564,6 +564,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
+ return emitBuiltinWithOneOverloadedType<5>(*this, E,
+ Intrinsic::amdgcn_load_to_lds);
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a6366aceec2a6..e6414a623b929 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 8251d6c213e3d..4b73347ac8155 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
}
+// CHECK-LABEL: @test_load_to_lds_96(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_96(global void* src, local void *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_128(global void* src, local void *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
+}
+
// CHECK-LABEL: @test_global_load_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
new file mode 100644
index 0000000000000..6cdedb33bdd80
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_load_to_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u32(global u32* src, local u32 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u16(global u16* src, local u16 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u8(global u8* src, local u8 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6fb206e4df188..d86fc74fe2889 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -102,6 +102,14 @@ Changes to the AMDGPU Backend
* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
+* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
+intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
+(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
+represented in the IR without needing to use buffer resource intrinsics directly.
+This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
+buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
+optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a57eb4a6dba49..3c9886a01d757 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2641,6 +2641,27 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+/// This is a general-purpose intrinsic for all operations that take a pointer
+/// a base location in LDS, and a data size and use it to perform a gather to LDS.
+/// This allows abstracting over both global pointers (address space 1) and
+/// the buffer-resource-wrapper pointers (address space 7 and 9).
+/// TODO: add support for address space 5 and scratch_load_lds.
+class AMDGPULoadToLDS :
+ ClangBuiltin<"__builtin_amdgcn_load_to_lds">,
+ Intrinsic <
+ [],
+ [llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
+ LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
+ llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
+ llvm_i32_ty, // imm offset (applied to both input and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+ // bit 1 = sc1,
+ // bit 4 = scc))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
+def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
+
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2fa03e3964207..907b5b7e705d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
return selectBufferLoadLds(I);
+ // Until we can store both the address space of the global and the LDS
+ // arguments by having tto MachineMemOperands on an intrinsic, we just trust
+ // that the argument is a global pointer (buffer pointers have been handled by
+ // a LLVM IR-level lowering).
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
return selectGlobalLoadLds(I);
case Intrinsic::amdgcn_exp_compr:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 7163ad2aa7dca..f86aafdf08f9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_load_to_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_load_to_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1d0e81db5a5db..6085c8d584af2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 6); // soffset
return;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5273,6 +5274,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c05ba42d999e9..c686bb00bc286 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1450,6 +1450,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
@@ -1531,6 +1532,7 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
Ptr = II->getArgOperand(0);
break;
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
Ptr = II->getArgOperand(1);
break;
@@ -10219,6 +10221,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(Load, 0);
}
+ // Buffers are handled by LowerBufferFatPointers, and we're going to go
+ // for "trust me" that the remaining cases are global pointers until
+ // such time as we can put two mem operands on an intrinsic.
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
@@ -10249,7 +10255,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
break;
}
- auto *M = cast<MemSDNode>(Op);
SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SmallVector<SDValue, 6> Ops;
@@ -10289,6 +10294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
+ auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = Op->getConstantOperandVal(5);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
new file mode 100644
index 0000000000000..72ef6963c9976
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds
+
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds),
+
+;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to
+;; the correct intrinsic.
+
+declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx3_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
+; GFX950-NEXT: s_endpgm
+ call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX950-NEXT: s_mov_b32 m0, s5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
+; GFX950-NEXT: s_endpgm
+ %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+ call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+ ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) {
+; GFX950-LABEL: global_load_lds_dwordx4_vaddr_saddr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_mov_b32 m0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
+; GFX950-NEXT: s_endpgm
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr...
[truncated]
High level question: I don't understand why you call this a "gather" operation. What do you mean by that? Isn't it semantically just a memcpy, or a (global/buffer) load followed by a (LDS) store?
This is more like a subgroup operation because the destination base offset is uniform.
The semantics of this operation (at least in the pre-gfx950 cases) are: each lane supplies its own global address, and the loaded values are written to LDS starting at a wave-uniform base, one 4-byte slot per lane.
Note that your lane-varying global address can point all over memory, but the values written to LDS always go at base, base + 4 bytes, base + 8 bytes, ... base + (wavesize - 1) * 4 bytes. From where I'm standing, this is a gather.
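To make that concrete, here is a minimal sketch (not taken from the patch) of the 4-byte case, reusing the declaration style from the test file in this PR; the per-lane effect is spelled out in the comments, and the cache-policy ("aux") bits are simply left at zero.

```llvm
; Minimal sketch, assuming the post-patch llvm.amdgcn.load.to.lds intrinsic.
declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture, i32, i32, i32)

define amdgpu_ps void @gather_one_dword(ptr addrspace(1) %gptr, ptr addrspace(3) inreg %lptr) {
  ; For each active lane l in the wave:
  ;   LDS[%lptr + 4 * l] <- 4 bytes loaded from lane l's %gptr
  ; %gptr lives in a VGPR and may differ arbitrarily per lane; %lptr and the
  ; size/offset/aux operands are wave-uniform.
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0)
  ret void
}
```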
Can you please document it in the AMDGPUUsage table as well?
I see. The LDS part is doing "addtid" addressing. There are other instructions that do this, like the ds_addtid loads and stores. I think we could add the codegen support just by pattern-matching the address. Then buffer-load-to-lds could be pattern-matched as a regular (fat pointer) buffer load followed by an addtid-style LDS store, right? So no intrinsic is really needed?
@jayfoad I still think we need an intrinsic here, because a load + an addtid store can be scheduled very differently from the asynchronous "gather to LDS" - and because we don't want this load/store to be optimized.
IMO the intrinsic should only be added as a last resort, if we really can't get the pattern-based codegen to work well enough.
Well, if y'all want to go add a pattern for this and eventually deprecate the intrinsics, I'm all ears, but we're trying to use these instructions now.
Beg to differ, particularly in this case. In a downstream application, I want fine control over using this particular instruction, so it gets propagated down to LLVM IR without being changed or modified along the way. Well, the actual reason: we need this instruction now. :-p
[llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both input and LDS address)
What's the point of the imm offset argument? It's not semantically useful, right? You could just add this offset to the two pointer arguments before calling the intrinsic.
I'm sure the existing intrinsics expose it for a reason - probably because there isn't pattern-matching to strip such an offset
... Oh. Consider the case that you have global p + N and LDS q + N. Then the LDS combiner can rewrite this to (q' + O) + N, aka q' + (O + N).
Then the two pointers won't have the same offset anymore and so it's unclear if you can slide it onto the instruction immediate
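A hypothetical illustration of that scenario (the LDS global @q, its merged-allocation rewrite, and all other names here are invented; this is not code from the PR):

```llvm
; Before any LDS rewriting, both addresses share the same variable offset %n,
; so one could imagine folding a common constant part of the two addresses
; into the intrinsic's immediate-offset operand.
@q = internal addrspace(3) global [64 x i32] poison, align 4

declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture, i32, i32, i32)

define amdgpu_ps void @shared_offset(ptr addrspace(1) %p, i32 %n) {
  %g = getelementptr i8, ptr addrspace(1) %p, i32 %n
  %l = getelementptr i8, ptr addrspace(3) @q, i32 %n
  ; If an LDS pass later rebases @q into a merged allocation at byte offset O,
  ; the LDS address becomes (merged base + O) + %n while the global address is
  ; still %p + %n, so there is no single shared offset left to peel off both.
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %g, ptr addrspace(3) %l, i32 4, i32 0, i32 0)
  ret void
}
```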
???
Matching addressing modes is part of the compiler's job. If you want absolute control over what goes in the immediate offset field you can write assembler!
Lack of pattern matching isn't a reason to have the offset. There should be offset pattern matching regardless. I thought the argument for the current buffer intrinsic offset argument was something about the unreasonable bounds checking behaviors
So @shiltian may know of reasons, which I'm not aware of, that that's there.
(The buffer intrinsic's soffset bit is probably a case where there may have been incorrect bounds checking at some point, and is unrelated, I think.)
The global one definitely shouldn't have the offset (given it's there, we should be trying to do addressing mode folding into it)
big shrug
Is this something y'all want to try and fix here?
(Also, procedural history on the buffer intrinsics points me to https://reviews.llvm.org/D124884 ( @rampitec ), which leaves it rather unclear why the immarg was added ... I think it's because for LDS, unlike the other buffer instructions, you can't do voffset => (actual voffset + imm))
Right, it was added in https://reviews.llvm.org/D125279.
Oh, thanks for finding the context! git blame failed me. So ... we're having the discussion from that thread again, and therefore I'd like to appeal to precedent in the short term (regarding the immoffset parameter) in the interests of making some sort of progress.
If we ever fix the immoffset issue, upgrading old calls by making the immoffset a constant 0 and adding it to both pointers should be fine? But that'd require a sufficiently robust pattern match, which I'm not sure we're convinced of.
Re discussion on the other PR about "why is this even an intrinsic" - since this probably shouldn't just be in @jayfoad's DMs:

The reason I disagree with "just pattern-match it" is that you can't get the scheduling you want without the guarantee of the intrinsic. Namely, while a per-lane global load followed by an addtid-style LDS store is obviously equivalent to the gather in isolation, if we turn that first example into one where other code touches that LDS region between the load and the store, we really don't want that match firing, because LDS gets overwritten ... unless we're double-buffering into LDS and so are relying on exactly that transformation, in which case, if the pattern match for the addtid load fails, say because of waitcnt insertion, that'll cause problems for the program.

Not to mention, because we don't have an intrinsic for ds_addtid, and because there are a lot of ways to spell the lane ID (mbcnt, workitem.id.x with annotations, a bunch of workitem IDs mod 64, etc. etc.), that'll be quite fragile.

So in the context of GEMM stuff, I'd rather not leave this at "hope the compiler recognizes what we're trying to do". If the compiler can be made to recognize what we're trying to do reliably in the future, that'll be cool, but I can't be the one to write that patch and I don't think there's infinite bandwidth among the AMDGPU crowd for this improvement either.
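For readers following along, here is a rough, hypothetical sketch (not from the PR) of the kind of IR such a pattern matcher would have to recognize as a single gather; the mbcnt-based lane-ID computation is only one of the spellings mentioned above, and all names here are invented.

```llvm
; One hand-written spelling of "per-lane global load + addtid-style LDS store".
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)

define amdgpu_ps void @manual_gather(ptr addrspace(1) %gptr, ptr addrspace(3) inreg %lds.base) {
  ; Lane ID within a wave64, computed via mbcnt (one of many possible spellings).
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %lane = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)

  ; Per-lane global load ...
  %v = load i32, ptr addrspace(1) %gptr, align 4

  ; ... followed by an LDS store at base + 4 * lane.
  %slot = getelementptr i32, ptr addrspace(3) %lds.base, i32 %lane
  store i32 %v, ptr addrspace(3) %slot, align 4
  ret void
}
```

Any intervening LDS access, any reordering, or any other way of computing %lane would have to be accounted for before such a match could fire both safely and reliably.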
Ping
Let me try to get the ball rolling here. I think most of the concerns have been addressed, and the PR looks to be in reasonably good shape. It looks good to me. Maybe give it a bit more time in case others have additional comments.
I think we'd be better off omitting the offset from the intrinsic signature. It won't play nicely with other places we try to handle addressing modes, and introduces potential subtarget dependence on the range of supported values.
@arsenm You're right that it might be better to omit the offset, but all the existing intrinsics that I'm abstracting over do have such a field. If you want to add new intrinsics that don't have the offset and that pattern-match instead, I'd be more than happy to review that - or if we want to break back-compat by getting rid of the offset.
I don't think we need to worry about compatibility with an intrinsic that's been committed for a day.
I think we could do with an additional overload here. Currently a bunch of code (notably CK, but probably elsewhere) uses the v4i32 version of the LDS intrinsics. I think this patch lets one use the 128-bit addrspace(7) pointer alternative, so callers could transform the v4i32 into an addrspace(7) pointer and then call this. It's not very clear from the backend docs how this stuff is supposed to be wired up by the user. Possibly bitcast from the v4i32 into an addrspace(8)-annotated i128, and then addrspacecast to 7 to provide an extra 32 bits of zero, and then onward to this builtin? Whatever the proper sequence might be, adding an overload which takes a v4i32 and does the conversion is likely to improve adoption for the new builtin.
You're not supposed to use the v4i32 versions of the intrinsics in general. Instead of a v4i32, you should create a ptr addrspace(8) buffer resource. If you want to use a buffer resource like a normal pointer, you can create a ptr addrspace(7) buffer fat pointer. But in general, the v4i32 forms of the buffer intrinsics are deprecated in favor of the ptr addrspace(8) (.ptr.buffer.) forms.
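As a concrete but hypothetical sketch of that route (not from this PR) for code that starts from a base address rather than an existing v4i32: build the descriptor with make.buffer.rsrc, request it as an addrspace(7) fat pointer, and gather through it. The .p7.p0 mangling assumes an LLVM recent enough that make.buffer.rsrc's result type is overloadable; stride 0 and flags 0 describe a plain raw buffer, and %nbytes is the buffer extent.

```llvm
declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr, i16, i32, i32)
declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture, ptr addrspace(3) nocapture, i32, i32, i32)

define amdgpu_ps void @gather_via_fat_pointer(ptr inreg %base, i32 inreg %nbytes, i32 %voff, ptr addrspace(3) inreg %lptr) {
  ; Wrap %base in a buffer descriptor and view it as an addrspace(7) fat pointer.
  %fat = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %base, i16 0, i32 %nbytes, i32 0)
  ; Per-lane offset into the buffer, then a 4-byte-per-lane gather to LDS.
  %src = getelementptr i8, ptr addrspace(7) %fat, i32 %voff
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %src, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0)
  ret void
}
```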
Right, but for code that already uses v4i32, given it has the same semantic content as the new and improved modelling, how does one turn a v4i32 into one of the new things? Pull the vector apart and pass as separate arguments to __builtin_amdgcn_make_buffer_rsrc? And given that one can turn the v4i32 into the new thing and pass it into this builtin, how about we let this overloaded builtin accept it directly?
You can bitcast a v4i32 into a […]. From there, you'd want to call the […]. Getting a […]
... Yeah, I'd pull the vector apart and call make.buffer.rsrc. Or, ideally, find the code that creates the v4i32 and have it call make.buffer.rsrc instead. (Misread your question)
So given a v4i32: bitcast to i128, then inttoptr to addrspace(7), then call amdgcn_load_to_lds on it. The conversion to addrspace(8) is unrelated to the problem at hand, since this builtin doesn't take it anyway, so we can ignore that. (That means addrspacecast 7 -> 8 is not invertible by 8 -> 7, right? It would discard some bits, in an invisible-breakage sort of way? Is there an RFC for that design?) I think it would be a kind, friendly-to-existing-library-developers thing to do to add an overload to this builtin so that they can pass the v4i32 they're already using into it. That would provide a small step in the direction of increased portability. Are you/Matt strongly opposed to that as a feature?
@JonChesterfield This builtin, semantically, cannot accommodate the v4i32 usage. When you have a v4i32, you need to also specify, as an additional argument, the offset into the buffer (which the addrspace(7) fat pointer already carries).
(You'll note that in https://github.com/llvm/llvm-project/pull/137425/files#diff-f904f8cd236733212015dd1988ffefcc9f79f7484ee46e3e3833d2d75fa69542R2243 , this intrinsic gets lowered to amdgcn.raw.ptr.buffer.load.lds, with the fat pointer split into its resource and offset parts.)
I'm not aware of anything requiring addrspacecast to be invertible? (In particular, the 7 -> 8 cast isn't a thing at the moment.)
## Summary

This PR sets the foundation for using the `global_load_lds` instruction to load values from global to LDS memory. The pipeline is as follows:

* Only convert `linalg.copy` emitted in `PromoteGPUMatMulOperands`. When it sees fit, insert a different attribute (`#iree_gpu.use_global_load_dma`) on `linalg.copy` to tag it along the pipeline.
* Tagged `linalg.copy` will not be decomposed/tiled until bufferization.
* After distribution to threads and bufferization, the tagged `linalg.copy` will then be lowered to a sequence of code responsible for the subgroup-coalesced loading op `iree_gpu.global_load_dma`.
* `iree_gpu.global_load_dma` will be mapped to the `amdgpu.gather_to_lds` op, which will be mapped to the corresponding rocdl op.
* Disable the padding-to-reduce-bank-conflicts pass because the destination workgroup memory has to be contiguous.

## Lowering `linalg.copy`

After bufferization and distribution to threads, the tagged `linalg.copy` still exists in the IR:

```
linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
  ins(%subview_12 : memref<64x128xi8, strided<[256, 1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>)
  outs(%alloc_4 : memref<64x128xi8, #gpu.address_space<workgroup>>)
```

Note that this `linalg.copy` is kept in the thread's code. The op itself is then converted into a for loop, in which each subgroup of threads loads a coalesced chunk of values. For example, assume there are N subgroups loading from `tensor<a x b x c>`:

* The `i`-th subgroup will load a subtensor of size `[a/N, b, c]`, so each slice is consecutive.
  * At this moment, assume row-major layout, and only tile the outermost dim.
  * The reason we are only dealing with `linalg.copy` emitted by `GPUPromoteMatmulOperands` right now is that we know the destination is allocated contiguously.
  * TODO: expand to any memref slices.
* Given `gpu.subgroup_id` and `gpu.lane_id`, each thread calculates the consecutive data chunk its subgroup is responsible for loading:
  * The chunk indices are the delinearized indices of the input tensor, from `affine.delinearize_index[gpu.subgroup_id * (num_elems_of(tensor) / num_subgroups)]` to `affine.delinearize_index[(gpu.subgroup_id + 1) * (num_elems_of(tensor) / num_subgroups) - 1]`.
* Assume each subgroup will load `n` values from linearized index `[N_f, N_b]`; then the thread with lane id `i` will try to load `iter = 0 to n : N_f + subgroup_size * iter + (i - 1)`.

Then it will be converted to something like the following (in the example, assume `workgroup size = 256`, `subgroup_size = 64`, loading `64x128xi8`):

```mlir
scf.for %indvar = %c0 to %c32 step %c1 {
  ;; thread-specific gathering address from global address
  %17 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 2048 + s2 * 64)>()[%lane_id, %subgroup_id, %indvar]
  %18:2 = affine.delinearize_index %17 into (128, 64) : index, index
  ;; this iteration's base storing index
  %19 = affine.apply affine_map<()[s0, s1] -> (s0 * 2048 + s1 * 64)>()[%subgroup_id, %indvar]
  %20:2 = affine.delinearize_index %19 into (128, 64) : index, index
  iree_gpu.global_load_dma %subview_13[%18#0, %18#1] -> %alloc_5[%20#0, %20#1] : memref<128x64xi8, strided<[256, 1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>> -> memref<128x64xi8, #gpu.address_space<workgroup>>
}
;; if there are residual elements (subgroup_copy_region_size % subgroup_size != 0), copy residual elements here
gpu.barrier
```

## Dependent PRs:

* design doc: https://hackmd.io/N0RitxPzT9GPhM0jEPtOCg?view
* upstream changes required:
  * llvm/llvm-project#133498
  * llvm/llvm-project#136405
  * llvm/llvm-project#137671
  * llvm/llvm-project#137425
* #20800 (review)

---------

Signed-off-by: Alan Li <[email protected]>
This PR adds an amdgcn.load.to.lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation.
This commit adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features).
It also plumbs the intrinsic through to clang.