
[AMDGPU] Add a new amdgcn.load.to.lds intrinsic #137425


Merged (4 commits) on May 19, 2025
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")

//===----------------------------------------------------------------------===//
5 changes: 5 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -564,6 +564,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
// Should this have asan instrumentation?
Contributor: Repeating a comment from the PR that duplicated this one: yes, but that's a backend problem?

Contributor (author): It might be a backend problem. However,

Address Dest = EmitPointerWithAlignment(E->getArg(0));

is the handling for memcpy(), and it doesn't seem to rely on the backend handling it. I don't know whether the same handling should be happening for this intrinsic.

Contributor (author): (Bumping this comment)

return emitBuiltinWithOneOverloadedType<5>(*this, E,
Intrinsic::amdgcn_load_to_lds);
}
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
1 change: 1 addition & 0 deletions clang/lib/Sema/SemaAMDGPU.cpp
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,

switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
case AMDGPU::BI__builtin_amdgcn_load_to_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
67 changes: 67 additions & 0 deletions clang/test/CodeGenHIP/amdgpu-load-to-lds.hip
@@ -0,0 +1,67 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s

// COM: Most tests are in the OpenCL semantics, this is just a verification for HIP

#define __device__ __attribute__((device))
#define __shared__ __attribute__((shared))

typedef unsigned int u32;

// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_u32PjS_(
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
// CHECK-NEXT: ret void
//
__device__ void test_load_to_lds_u32(u32* src, __shared__ u32 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: define dso_local void @_Z30test_load_to_lds_u32_flat_destPjS_(
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
// CHECK-NEXT: ret void
//
__device__ void test_load_to_lds_u32_flat_dest(u32* src, u32 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_128PvS_(
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 16, i32 0, i32 0)
// CHECK-NEXT: ret void
//
__device__ void test_load_to_lds_128(void* src, __shared__ void *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
}
30 changes: 30 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
}

// CHECK-LABEL: @test_load_to_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_96(global void* src, local void *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: @test_load_to_lds_128(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_128(global void* src, local void *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: @test_global_load_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
60 changes: 60 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -0,0 +1,60 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
// REQUIRES: amdgpu-registered-target

typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;

// CHECK-LABEL: @test_load_to_lds_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_u32(global u32* src, local u32 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: @test_load_to_lds_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_u16(global u16* src, local u16 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: @test_load_to_lds_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_u8(global u8* src, local u8 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
}
25 changes: 25 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl
@@ -0,0 +1,25 @@
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s
// REQUIRES: amdgpu-registered-target

typedef unsigned int u32;

void test_load_to_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
__builtin_amdgcn_load_to_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
}

__attribute__((target("gfx950-insts")))
void test_load_to_lds_via_target_feature(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
}
10 changes: 9 additions & 1 deletion llvm/docs/AMDGPUUsage.rst
@@ -1216,7 +1216,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.

:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.

llvm.amdgcn.load.to.lds.p<1/7> Loads values from global memory (either in the form of a global pointer or
a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2,
or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer
argument should be wavefront-uniform; the global pointer need not be.
The LDS pointer is implicitly offset by 4 * lane_id bytes for sizes <= 4 bytes
and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`,
`buffer_load_* ... lds`, or `global_load_* ... lds` depending on address
space and architecture. `amdgcn.global.load.lds` has the same semantics as
`amdgcn.load.to.lds.p1`.
llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
the lowest active lane of the input operand. Currently implemented
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
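As an illustration of the documentation entry above, here is a minimal LLVM IR sketch (function and value names are hypothetical) of the address-space-1 overload, using the same five-argument call signature exercised by the Clang tests in this PR. The per-lane LDS offsetting described above is applied by the hardware and is not visible in the IR.

declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)

; Copy 4 bytes per lane from %src to the wave-uniform LDS base %dst, with no
; extra immediate offset and the default cache policy (aux = 0).
define amdgpu_kernel void @copy_dword_to_lds(ptr addrspace(1) %src, ptr addrspace(3) %dst) {
entry:
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 4, i32 0, i32 0)
  ret void
}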
8 changes: 8 additions & 0 deletions llvm/docs/ReleaseNotes.md
@@ -103,6 +103,14 @@ Changes to the AMDGPU Backend

* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.

* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
represented in the IR without needing to use buffer resource intrinsics directly.
This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
optional, and there are no plans to deprecate `amdgcn.global.load.lds`.

Changes to the ARM Backend
--------------------------

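A hedged sketch of the buffer-fat-pointer use case mentioned in the release note (Clang does not yet generate this, and the kernel below is hypothetical): the p7 overload takes a ptr addrspace(7) source, which AMDGPULowerBufferFatPointers later rewrites into a raw buffer load to LDS, as shown in the lowering change at the end of this PR.

declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)

; Gather 4 bytes per lane from a buffer fat pointer into LDS.
define amdgpu_kernel void @gather_from_buffer(ptr addrspace(7) %buf, ptr addrspace(3) %lds) {
entry:
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %buf, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  ret void
}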
22 changes: 22 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2641,6 +2641,28 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

/// This is a general-purpose intrinsic for all operations that take a pointer,
/// a base location in LDS, and a data size, and use them to perform a gather to LDS.
/// This allows abstracting over both global pointers (address space 1) and
/// the buffer-resource-wrapper pointers (address spaces 7 and 9).
/// TODO: add support for address space 5 and scratch_load_lds.
class AMDGPULoadToLDS :
Intrinsic <
[],
[llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both input and LDS address)
Contributor: What's the point of the imm offset argument? It's not semantically useful, right? You could just add this offset to the two pointer arguments before calling the intrinsic.

Contributor (author): I'm sure the existing intrinsics expose it for a reason - probably because there isn't pattern matching to strip such an offset.

Contributor (author): ... Oh. Consider the case where you have global p + N and LDS q + N. The LDS combiner can rewrite this to (q' + O) + N, i.e. q' + (O + N). Then the two pointers no longer have the same offset, so it's unclear whether you can slide it onto the instruction immediate.

Contributor: ???

Matching addressing modes is part of the compiler's job. If you want absolute control over what goes in the immediate offset field, you can write assembler!

Contributor: Lack of pattern matching isn't a reason to have the offset; there should be offset pattern matching regardless. I thought the argument for the current buffer intrinsic's offset argument was something about the unreasonable bounds-checking behaviors.

Contributor (author): So @shiltian: that's there for reasons I may not be aware of.

(The buffer intrinsic's soffset bit is probably a case where there may have been incorrect bounds checking at some point, and is unrelated, I think.)

Contributor: The global one definitely shouldn't have the offset (given that it's there, we should be trying to do addressing-mode folding into it).

Contributor (author): Big shrug. Is this something y'all want to try and fix here?

(Also, the procedural history on the buffer intrinsics points me to https://reviews.llvm.org/D124884 (@rampitec), which leaves it rather unclear why the immarg was added ... I think it's because for LDS, unlike the other buffer instructions, you can't do voffset => (actual voffset + imm).)

Contributor: Right, it was added in https://reviews.llvm.org/D125279.

Contributor (author): Oh, thanks for finding the context! git blame failed me. So ... we're having the discussion from that thread again, and therefore I'd like to appeal to precedent in the short term (regarding the immoffset parameter) in the interest of making some sort of progress.

If we ever fix the immoffset issue, upgrading by making the immoffset a constant 0 and adding it to both pointers should be fine? But that would require a sufficiently robust pattern match, which I'm not sure we're convinced of.

llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
// bit 4 = scc))
[IntrWillReturn, IntrArgMemOnly,
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
def int_amdgcn_load_to_lds : AMDGPULoadToLDS;

class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
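To make the imm-offset discussion above concrete, here is a hedged IR sketch (illustrative only, following the documented semantics that the immediate offset applies to both addresses): the two functions below are equivalent, but the first carries a common +16 as GEPs on both pointers, where later LDS rewrites can break the symmetry, while the second carries it in the intrinsic's immediate-offset operand, which maps directly onto the instruction's immediate field.

declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)

define void @offset_on_pointers(ptr addrspace(1) %p, ptr addrspace(3) %q) {
  ; +16 expressed on both pointers; if %q is later relocated into a module LDS
  ; struct, the two offsets no longer match and the backend must re-derive a
  ; shared immediate by pattern matching.
  %p16 = getelementptr i8, ptr addrspace(1) %p, i64 16
  %q16 = getelementptr i8, ptr addrspace(3) %q, i32 16
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %p16, ptr addrspace(3) %q16, i32 4, i32 0, i32 0)
  ret void
}

define void @offset_as_operand(ptr addrspace(1) %p, ptr addrspace(3) %q) {
  ; The same access, with the +16 carried in the imm-offset operand.
  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %p, ptr addrspace(3) %q, i32 4, i32 16, i32 0)
  ret void
}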
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
return selectBufferLoadLds(I);
// Until we can store both the address space of the global and the LDS
// arguments by having two MachineMemOperands on an intrinsic, we just trust
// that the argument is a global pointer (buffer pointers have been handled by
// an LLVM IR-level lowering).
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
return selectGlobalLoadLds(I);
case Intrinsic::amdgcn_exp_compr:
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2151,6 +2151,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
case Intrinsic::amdgcn_load_to_lds:
return true;
}
}
@@ -2239,6 +2240,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
case Intrinsic::amdgcn_load_to_lds: {
Value *Ptr = I.getArgOperand(0);
if (!isSplitFatPtr(Ptr->getType()))
return {nullptr, nullptr};
IRB.SetInsertPoint(&I);
auto [Rsrc, Off] = getPtrParts(Ptr);
Value *LDSPtr = I.getArgOperand(1);
Value *LoadSize = I.getArgOperand(2);
Value *ImmOff = I.getArgOperand(3);
Value *Aux = I.getArgOperand(4);
Value *SOffset = IRB.getInt32(0);
Instruction *NewLoad = IRB.CreateIntrinsic(
Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
{Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
copyMetadata(NewLoad, &I);
SplitUsers.insert(&I);
I.replaceAllUsesWith(NewLoad);
return {nullptr, nullptr};
}
}
return {nullptr, nullptr};
}
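A rough before/after sketch of the rewrite implemented above (names are hypothetical; the resource/offset pair comes from getPtrParts): the buffer fat pointer is split into its addrspace(8) resource and 32-bit offset halves, and the call is re-emitted as a raw buffer load to LDS with the offset used as voffset and soffset fixed to 0, matching the argument order built in the code.

declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)
declare void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8), ptr addrspace(3), i32 immarg, i32, i32, i32 immarg, i32 immarg)

; Before the pass: the source operand is a buffer fat pointer.
define void @before(ptr addrspace(7) %fat, ptr addrspace(3) %lds) {
  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %fat, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  ret void
}

; Conceptually after the pass: %fat has become a resource plus a 32-bit offset,
; and the call arguments are {rsrc, lds, size, voffset, soffset, imm offset, aux}.
define void @after(ptr addrspace(8) %rsrc, i32 %off, ptr addrspace(3) %lds) {
  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 %off, i32 0, i32 0, i32 0)
  ret void
}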