Skip to content

Commit d53c6cd

Browse files
authored
[AMDGPU][Clang] Builtin for GLOBAL_LOAD_LDS on GFX940 (#92962)
Fixes: SWDEV-459212
1 parent 64f6406 commit d53c6cd

File tree

3 files changed

+67
-17
lines changed

3 files changed

+67
-17
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
240240
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
241241
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
242242
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
243+
// Arguments: global source pointer, LDS destination pointer, size, offset, aux.
// size, offset and aux are ImmArg operands of the underlying intrinsic, so the
// 'I' prefix requires each of them to be an integer constant expression.
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940-insts")
243244

244245
//===----------------------------------------------------------------------===//
245246
// Deep learning builtins.
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s
3+
// REQUIRES: amdgpu-registered-target
4+
5+
typedef unsigned int u32;
6+
typedef unsigned short u16;
7+
typedef unsigned char u8;
8+
9+
// CHECK-LABEL: @test_global_load_lds_u32(
10+
// CHECK-NEXT: entry:
11+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
12+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
13+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
14+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
15+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
16+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
17+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
18+
// CHECK-NEXT: ret void
19+
//
20+
// Calls the builtin with size=4, offset=0, aux=0 on u32 pointers; the CHECK
// lines above expect a single llvm.amdgcn.global.load.lds call with i32 4, 0, 0.
void test_global_load_lds_u32(global u32* src, local u32 *dst) {
21+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
22+
}
23+
24+
// CHECK-LABEL: @test_global_load_lds_u16(
25+
// CHECK-NEXT: entry:
26+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
27+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
28+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
29+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
30+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
31+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
32+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
33+
// CHECK-NEXT: ret void
34+
//
35+
// Calls the builtin with size=2, offset=0, aux=0 on u16 pointers; the CHECK
// lines above expect a single llvm.amdgcn.global.load.lds call with i32 2, 0, 0.
void test_global_load_lds_u16(global u16* src, local u16 *dst) {
36+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
37+
}
38+
39+
// CHECK-LABEL: @test_global_load_lds_u8(
40+
// CHECK-NEXT: entry:
41+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
42+
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
43+
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
44+
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
45+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
46+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
47+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
48+
// CHECK-NEXT: ret void
49+
//
50+
// Calls the builtin with size=1, offset=0, aux=0 on u8 pointers; the CHECK
// lines above expect a single llvm.amdgcn.global.load.lds call with i32 1, 0, 0.
void test_global_load_lds_u8(global u8* src, local u8 *dst) {
51+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
52+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2466,23 +2466,20 @@ def int_amdgcn_perm :
24662466
// GFX9 Intrinsics
24672467
//===----------------------------------------------------------------------===//
24682468

2469-
class AMDGPUGlobalLoadLDS : Intrinsic <
2470-
[],
2471-
[LLVMQualPointerType<1>, // Base global pointer to load from
2472-
LLVMQualPointerType<3>, // LDS base pointer to store to
2473-
llvm_i32_ty, // Data byte size: 1/2/4
2474-
llvm_i32_ty, // imm offset (applied to both global and LDS address)
2475-
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
2476-
// bit 1 = slc/sc1,
2477-
// bit 2 = dlc on gfx10/gfx11))
2478-
// bit 4 = scc/nt on gfx90a+))
2479-
// gfx12+:
2480-
// cachepolicy (bits [0-2] = th,
2481-
// bits [3-4] = scope)
2482-
// swizzled buffer (bit 6 = swz),
2483-
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
2484-
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2485-
"", [SDNPMemOperand]>;
2469+
// Intrinsic class for a load from a global (addrspace 1) pointer that stores
// its result to an LDS (addrspace 3) pointer. Returns nothing. Exposed to
// Clang as __builtin_amdgcn_global_load_lds via the ClangBuiltin mixin.
class AMDGPUGlobalLoadLDS :
2470+
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
2471+
Intrinsic <
2472+
[],
2473+
[LLVMQualPointerType<1>, // Base global pointer to load from
2474+
LLVMQualPointerType<3>, // LDS base pointer to store to
2475+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
2476+
llvm_i32_ty, // imm offset (applied to both global and LDS address)
2477+
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
2478+
// bit 1 = sc1,
2479+
// bit 4 = scc))
2480+
[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
2481+
// Operands 2-4 (size, offset, aux) are ImmArg: callers must pass constants.
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
2482+
"", [SDNPMemOperand]>;
24862483
// Defines the intrinsic record int_amdgcn_global_load_lds from the
// AMDGPUGlobalLoadLDS class with no additional overrides.
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
24872484

24882485
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)