Skip to content

Commit 4bdd116

Browse files
authored
[AMDGPU] Add a new amdgcn.load.to.lds intrinsic (#137425)
This PR adds a amdgns_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation. This commit adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang.
1 parent 2c6b239 commit 4bdd116

File tree

26 files changed

+787
-54
lines changed

26 files changed

+787
-54
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
257257
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
258258
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
259259
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
260+
TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
260261
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
261262

262263
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
564564
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
565565
return Builder.CreateCall(F, {Addr});
566566
}
567+
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
568+
// Should this have asan instrumentation?
569+
return emitBuiltinWithOneOverloadedType<5>(*this, E,
570+
Intrinsic::amdgcn_load_to_lds);
571+
}
567572
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
568573
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
569574
{llvm::Type::getInt64Ty(getLLVMContext())});

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
3636

3737
switch (BuiltinID) {
3838
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
39+
case AMDGPU::BI__builtin_amdgcn_load_to_lds:
3940
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
4041
constexpr const int SizeIdx = 2;
4142
llvm::APSInt Size;
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
4+
5+
// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP
6+
7+
#define __device__ __attribute__((device))
8+
#define __shared__ __attribute__((shared))
9+
10+
typedef unsigned int u32;
11+
12+
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_u32PjS_(
13+
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
14+
// CHECK-NEXT: [[ENTRY:.*:]]
15+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
16+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
17+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
18+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
19+
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
20+
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
21+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
22+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
23+
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
24+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
25+
// CHECK-NEXT: ret void
26+
//
27+
__device__ void test_load_to_lds_u32(u32* src, __shared__ u32 *dst) {
28+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
29+
}
30+
31+
// CHECK-LABEL: define dso_local void @_Z30test_load_to_lds_u32_flat_destPjS_(
32+
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
35+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
36+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
37+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
38+
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
39+
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
40+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
41+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
42+
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
43+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
44+
// CHECK-NEXT: ret void
45+
//
46+
__device__ void test_load_to_lds_u32_flat_dest(u32* src, u32 *dst) {
47+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
48+
}
49+
50+
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_128PvS_(
51+
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
52+
// CHECK-NEXT: [[ENTRY:.*:]]
53+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
54+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
55+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
56+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
57+
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
58+
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
59+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
60+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
61+
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
62+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 16, i32 0, i32 0)
63+
// CHECK-NEXT: ret void
64+
//
65+
__device__ void test_load_to_lds_128(void* src, __shared__ void *dst) {
66+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
67+
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1764,6 +1764,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
17641764
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
17651765
}
17661766

1767+
// CHECK-LABEL: @test_load_to_lds_96(
1768+
// CHECK-NEXT: entry:
1769+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1770+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
1771+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1772+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
1773+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
1774+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1775+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
1776+
// CHECK-NEXT: ret void
1777+
//
1778+
void test_load_to_lds_96(global void* src, local void *dst) {
1779+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
1780+
}
1781+
1782+
// CHECK-LABEL: @test_load_to_lds_128(
1783+
// CHECK-NEXT: entry:
1784+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1785+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
1786+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1787+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
1788+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
1789+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1790+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
1791+
// CHECK-NEXT: ret void
1792+
//
1793+
void test_load_to_lds_128(global void* src, local void *dst) {
1794+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
1795+
}
1796+
17671797
// CHECK-LABEL: @test_global_load_lds_96(
17681798
// CHECK-NEXT: entry:
17691799
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
4+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
5+
// REQUIRES: amdgpu-registered-target
6+
7+
typedef unsigned int u32;
8+
typedef unsigned short u16;
9+
typedef unsigned char u8;
10+
11+
// CHECK-LABEL: @test_load_to_lds_u32(
12+
// CHECK-NEXT: entry:
13+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
14+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
15+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
16+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
17+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
18+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
19+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
20+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
21+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
22+
// CHECK-NEXT: ret void
23+
//
24+
void test_load_to_lds_u32(global u32* src, local u32 *dst) {
25+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
26+
}
27+
28+
// CHECK-LABEL: @test_load_to_lds_u16(
29+
// CHECK-NEXT: entry:
30+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
31+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
32+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
33+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
34+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
35+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
36+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
37+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
38+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
39+
// CHECK-NEXT: ret void
40+
//
41+
void test_load_to_lds_u16(global u16* src, local u16 *dst) {
42+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
43+
}
44+
45+
// CHECK-LABEL: @test_load_to_lds_u8(
46+
// CHECK-NEXT: entry:
47+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
48+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
49+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
50+
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
51+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
52+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
53+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
54+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
55+
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
56+
// CHECK-NEXT: ret void
57+
//
58+
void test_load_to_lds_u8(global u8* src, local u8 *dst) {
59+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
60+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s
4+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s
5+
// REQUIRES: amdgpu-registered-target
6+
7+
typedef unsigned int u32;
8+
9+
void test_load_to_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
10+
__builtin_amdgcn_load_to_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
11+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
12+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
13+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
14+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
15+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
16+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
17+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
18+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
19+
}
20+
21+
__attribute__((target("gfx950-insts")))
22+
void test_load_to_lds_via_target_feature(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
23+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
24+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
25+
}

llvm/docs/AMDGPUUsage.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1216,7 +1216,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
12161216
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
12171217

12181218
:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specifies state.
1219-
1219+
llvm.amdgcn.load.to.lds.p<1/7> Loads values from global memory (either in the form of a global
1220+
a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2,
1221+
or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer
1222+
argument should be wavefront-uniform; the global pointer need not be.
1223+
The LDS pointer is implicitly offset by 4 * lane_id bytes for sies <= 4 bytes
1224+
and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`,
1225+
`buffer_load_* ... lds`, or `global_load__* ... lds` depnedening on address
1226+
space and architecture. `amdgcn.global.load.lds` has the same semantics as
1227+
`amdgcn.load.to.lds.p1`.
12201228
llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
12211229
the lowest active lane of the input operand. Currently implemented
12221230
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,

llvm/docs/ReleaseNotes.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,14 @@ Changes to the AMDGPU Backend
110110

111111
* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
112112

113+
* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
114+
intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
115+
(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
116+
represented in the IR without needing to use buffer resource intrinsics directly.
117+
This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
118+
buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
119+
optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
120+
113121
Changes to the ARM Backend
114122
--------------------------
115123

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2641,6 +2641,28 @@ def int_amdgcn_perm :
26412641
// GFX9 Intrinsics
26422642
//===----------------------------------------------------------------------===//
26432643

2644+
/// This is a general-purpose intrinsic for all operations that take a pointer
2645+
/// a base location in LDS, and a data size and use it to perform a gather to LDS.
2646+
/// This allows abstracting over both global pointers (address space 1) and
2647+
/// the buffer-resource-wrapper pointers (address space 7 and 9).
2648+
/// TODO: add support for address space 5 and scratch_load_lds.
2649+
class AMDGPULoadToLDS :
2650+
Intrinsic <
2651+
[],
2652+
[llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
2653+
LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
2654+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
2655+
llvm_i32_ty, // imm offset (applied to both input and LDS address)
2656+
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
2657+
// bit 1 = sc1,
2658+
// bit 4 = scc))
2659+
[IntrWillReturn, IntrArgMemOnly,
2660+
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
2661+
NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<1>>,
2662+
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2663+
"", [SDNPMemOperand]>;
2664+
def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
2665+
26442666
class AMDGPUGlobalLoadLDS :
26452667
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
26462668
Intrinsic <

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2329,6 +2329,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23292329
case Intrinsic::amdgcn_struct_buffer_load_lds:
23302330
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
23312331
return selectBufferLoadLds(I);
2332+
// Until we can store both the address space of the global and the LDS
2333+
// arguments by having tto MachineMemOperands on an intrinsic, we just trust
2334+
// that the argument is a global pointer (buffer pointers have been handled by
2335+
// a LLVM IR-level lowering).
2336+
case Intrinsic::amdgcn_load_to_lds:
23322337
case Intrinsic::amdgcn_global_load_lds:
23332338
return selectGlobalLoadLds(I);
23342339
case Intrinsic::amdgcn_exp_compr:

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2161,6 +2161,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
21612161
case Intrinsic::memset:
21622162
case Intrinsic::memset_inline:
21632163
case Intrinsic::experimental_memset_pattern:
2164+
case Intrinsic::amdgcn_load_to_lds:
21642165
return true;
21652166
}
21662167
}
@@ -2249,6 +2250,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
22492250
SplitUsers.insert(&I);
22502251
return {NewRsrc, Off};
22512252
}
2253+
case Intrinsic::amdgcn_load_to_lds: {
2254+
Value *Ptr = I.getArgOperand(0);
2255+
if (!isSplitFatPtr(Ptr->getType()))
2256+
return {nullptr, nullptr};
2257+
IRB.SetInsertPoint(&I);
2258+
auto [Rsrc, Off] = getPtrParts(Ptr);
2259+
Value *LDSPtr = I.getArgOperand(1);
2260+
Value *LoadSize = I.getArgOperand(2);
2261+
Value *ImmOff = I.getArgOperand(3);
2262+
Value *Aux = I.getArgOperand(4);
2263+
Value *SOffset = IRB.getInt32(0);
2264+
Instruction *NewLoad = IRB.CreateIntrinsic(
2265+
Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
2266+
{Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
2267+
copyMetadata(NewLoad, &I);
2268+
SplitUsers.insert(&I);
2269+
I.replaceAllUsesWith(NewLoad);
2270+
return {nullptr, nullptr};
2271+
}
22522272
}
22532273
return {nullptr, nullptr};
22542274
}

0 commit comments

Comments
 (0)