Skip to content

Commit 280d90d

Browse files
authored
AMDGPU: Add back half and bfloat support for global_load_tr16 pats (#99540)
half and bfloat are common types for 16-bit elements. Support for them was originally present but was later dropped. This change adds support for these floating-point types back.
1 parent 5431a31 commit 280d90d

11 files changed

+207
-14
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,8 +445,12 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
445445

446446
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
447447
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
448+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
449+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
448450
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
449451
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
452+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
453+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
450454

451455
//===----------------------------------------------------------------------===//
452456
// WMMA builtins.

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18725,7 +18725,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1872518725
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1872618726
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1872718727
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
18728-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
18728+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
18729+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
18730+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
18731+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
18732+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {
1872918733

1873018734
Intrinsic::ID IID;
1873118735
switch (BuiltinID) {
@@ -18734,7 +18738,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1873418738
IID = Intrinsic::amdgcn_global_load_tr_b64;
1873518739
break;
1873618740
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
18741+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
18742+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
1873718743
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
18744+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
18745+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
1873818746
IID = Intrinsic::amdgcn_global_load_tr_b128;
1873918747
break;
1874018748
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,22 @@
55

66
typedef int v2i __attribute__((ext_vector_type(2)));
77
typedef short v8s __attribute__((ext_vector_type(8)));
8+
typedef half v8h __attribute__((ext_vector_type(8)));
9+
typedef __bf16 v8y __attribute__((ext_vector_type(8)));
810
typedef short v4s __attribute__((ext_vector_type(4)));
11+
typedef half v4h __attribute__((ext_vector_type(4)));
12+
typedef __bf16 v4y __attribute__((ext_vector_type(4)));
913

10-
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr)
14+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr,
15+
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr)
1116
{
1217
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
1318
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
19+
v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
20+
v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
1421

15-
int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
16-
v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
22+
int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
23+
v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
24+
v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
25+
v4y out_8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
1726
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@
44
// REQUIRES: amdgpu-registered-target
55

66
typedef short v4s __attribute__((ext_vector_type(4)));
7+
typedef half v4h __attribute__((ext_vector_type(4)));
8+
typedef __bf16 v4y __attribute__((ext_vector_type(4)));
79

8-
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr)
10+
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr)
911
{
1012
int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
1113
v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
14+
v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
15+
v4y out_4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
1216
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,13 @@
55

66
typedef int v2i __attribute__((ext_vector_type(2)));
77
typedef short v8s __attribute__((ext_vector_type(8)));
8+
typedef half v8h __attribute__((ext_vector_type(8)));
9+
typedef __bf16 v8y __attribute__((ext_vector_type(8)));
810

9-
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr)
11+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr)
1012
{
1113
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
1214
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
15+
v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
16+
v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
1317
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
typedef int v2i __attribute__((ext_vector_type(2)));
66
typedef short v8s __attribute__((ext_vector_type(8)));
7+
typedef half v8h __attribute__((ext_vector_type(8)));
8+
typedef __bf16 v8y __attribute__((ext_vector_type(8)));
79

810
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
911
// CHECK-GFX1200-NEXT: entry:
@@ -24,3 +26,23 @@ v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
2426
{
2527
return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
2628
}
29+
30+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
31+
// CHECK-GFX1200-NEXT: entry:
32+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
33+
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
34+
//
35+
v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
36+
{
37+
return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
38+
}
39+
40+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
41+
// CHECK-GFX1200-NEXT: entry:
42+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
43+
// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]]
44+
//
45+
v8y test_amdgcn_global_load_tr_b128_v8bf16(global v8y* inptr)
46+
{
47+
return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
48+
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
44

55
typedef short v4s __attribute__((ext_vector_type(4)));
6+
typedef half v4h __attribute__((ext_vector_type(4)));
7+
typedef __bf16 v4y __attribute__((ext_vector_type(4)));
68

79
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
810
// CHECK-GFX1200-NEXT: entry:
@@ -23,3 +25,23 @@ v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
2325
{
2426
return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
2527
}
28+
29+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
30+
// CHECK-GFX1200-NEXT: entry:
31+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]])
32+
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
33+
//
34+
v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
35+
{
36+
return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
37+
}
38+
39+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
40+
// CHECK-GFX1200-NEXT: entry:
41+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
42+
// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]]
43+
//
44+
v4y test_amdgcn_global_load_tr_b128_v4bf16(global v4y* inptr)
45+
{
46+
return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
47+
}

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1590,10 +1590,14 @@ let OtherPredicates = [isGFX12Plus] in {
15901590
let WaveSizePredicate = isWave32 in {
15911591
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
15921592
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
1593+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>;
1594+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8bf16>;
15931595
}
15941596
let WaveSizePredicate = isWave64 in {
15951597
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
15961598
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>;
1599+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>;
1600+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4bf16>;
15971601
}
15981602
}
15991603

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,22 @@ bb:
213213
ret void
214214
}
215215

216+
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr)
217+
define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
218+
bb:
219+
%tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr)
220+
store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
221+
ret void
222+
}
223+
224+
; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr)
225+
define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
226+
bb:
227+
%tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr)
228+
store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
229+
ret void
230+
}
231+
216232
; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr)
217233
define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
218234
bb:
@@ -229,6 +245,22 @@ bb:
229245
ret void
230246
}
231247

248+
; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr)
249+
define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
250+
bb:
251+
%tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr)
252+
store <4 x half> %tmp0, ptr addrspace(1) %out, align 8
253+
ret void
254+
}
255+
256+
; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr)
257+
define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
258+
bb:
259+
%tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr)
260+
store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8
261+
ret void
262+
}
263+
232264
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
233265
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
234266
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -258,8 +290,12 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32
258290

259291
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
260292
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
293+
declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1))
294+
declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1))
261295
declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))
262296
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
297+
declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
298+
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
263299

264300
attributes #0 = { nounwind convergent }
265301
attributes #1 = { nounwind readnone convergent }

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44

55
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
66
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
7+
declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
8+
declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1))
79

8-
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
9-
; GFX12-LABEL: global_load_tr_b64:
10+
define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
11+
; GFX12-LABEL: global_load_tr_b64_v2i32:
1012
; GFX12: ; %bb.0: ; %entry
1113
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1214
; GFX12-NEXT: v_mov_b32_e32 v2, 0
@@ -24,8 +26,8 @@ entry:
2426
ret void
2527
}
2628

27-
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
28-
; GFX12-LABEL: global_load_tr_b128:
29+
define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
30+
; GFX12-LABEL: global_load_tr_b128_v8i16:
2931
; GFX12: ; %bb.0: ; %entry
3032
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
3133
; GFX12-NEXT: v_mov_b32_e32 v4, 0
@@ -42,3 +44,41 @@ entry:
4244
store <8 x i16> %val, ptr addrspace(1) %use
4345
ret void
4446
}
47+
48+
define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
49+
; GFX12-LABEL: global_load_tr_b128_v8f16:
50+
; GFX12: ; %bb.0: ; %entry
51+
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
52+
; GFX12-NEXT: v_mov_b32_e32 v4, 0
53+
; GFX12-NEXT: s_wait_kmcnt 0x0
54+
; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
55+
; GFX12-NEXT: s_wait_loadcnt 0x0
56+
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
57+
; GFX12-NEXT: s_nop 0
58+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59+
; GFX12-NEXT: s_endpgm
60+
entry:
61+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
62+
%val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
63+
store <8 x half> %val, ptr addrspace(1) %use
64+
ret void
65+
}
66+
67+
define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
68+
; GFX12-LABEL: global_load_tr_b128_v8bf16:
69+
; GFX12: ; %bb.0: ; %entry
70+
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
71+
; GFX12-NEXT: v_mov_b32_e32 v4, 0
72+
; GFX12-NEXT: s_wait_kmcnt 0x0
73+
; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
74+
; GFX12-NEXT: s_wait_loadcnt 0x0
75+
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
76+
; GFX12-NEXT: s_nop 0
77+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
78+
; GFX12-NEXT: s_endpgm
79+
entry:
80+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
81+
%val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
82+
store <8 x bfloat> %val, ptr addrspace(1) %use
83+
ret void
84+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44

55
declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
66
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
7+
declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1))
8+
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1))
79

8-
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
9-
; GFX12-LABEL: global_load_tr_b64:
10+
define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
11+
; GFX12-LABEL: global_load_tr_b64_i32:
1012
; GFX12: ; %bb.0: ; %entry
1113
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1214
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -24,8 +26,8 @@ entry:
2426
ret void
2527
}
2628

27-
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
28-
; GFX12-LABEL: global_load_tr_b128:
29+
define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
30+
; GFX12-LABEL: global_load_tr_b128_v4i16:
2931
; GFX12: ; %bb.0: ; %entry
3032
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
3133
; GFX12-NEXT: v_mov_b32_e32 v2, 0
@@ -42,3 +44,41 @@ entry:
4244
store <4 x i16> %val, ptr addrspace(1) %use
4345
ret void
4446
}
47+
48+
define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
49+
; GFX12-LABEL: global_load_tr_b128_v4f16:
50+
; GFX12: ; %bb.0: ; %entry
51+
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
52+
; GFX12-NEXT: v_mov_b32_e32 v2, 0
53+
; GFX12-NEXT: s_wait_kmcnt 0x0
54+
; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
55+
; GFX12-NEXT: s_wait_loadcnt 0x0
56+
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
57+
; GFX12-NEXT: s_nop 0
58+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59+
; GFX12-NEXT: s_endpgm
60+
entry:
61+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
62+
%val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep)
63+
store <4 x half> %val, ptr addrspace(1) %use
64+
ret void
65+
}
66+
67+
define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
68+
; GFX12-LABEL: global_load_tr_b128_v4bf16:
69+
; GFX12: ; %bb.0: ; %entry
70+
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
71+
; GFX12-NEXT: v_mov_b32_e32 v2, 0
72+
; GFX12-NEXT: s_wait_kmcnt 0x0
73+
; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
74+
; GFX12-NEXT: s_wait_loadcnt 0x0
75+
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
76+
; GFX12-NEXT: s_nop 0
77+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
78+
; GFX12-NEXT: s_endpgm
79+
entry:
80+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
81+
%val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1) %gep)
82+
store <4 x bfloat> %val, ptr addrspace(1) %use
83+
ret void
84+
}

0 commit comments

Comments
 (0)