Skip to content

Commit 57f6a3f

Browse files
authored
[AMDGPU] Add global_load_tr for GFX12 (#77772)
Support the new amdgcn_global_load_tr instructions for load with transpose: (1) MC layer support for GLOBAL_LOAD_TR_B64 and GLOBAL_LOAD_TR_B128; (2) the int_amdgcn_global_load_tr intrinsic; (3) the Clang builtins __builtin_amdgcn_global_load_tr_*.
1 parent 1566f1f commit 57f6a3f

16 files changed

+683
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,13 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
428428
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
429429
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
430430

431+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
432+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
433+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
434+
435+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
436+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
437+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
431438

432439
#undef BUILTIN
433440
#undef TARGET_BUILTIN

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18178,6 +18178,45 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1817818178
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
1817918179
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
1818018180
}
18181+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18182+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18183+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18184+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18185+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18186+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
18187+
18188+
llvm::Type *ArgTy;
18189+
switch (BuiltinID) {
18190+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18191+
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
18192+
break;
18193+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18194+
ArgTy = llvm::FixedVectorType::get(
18195+
llvm::Type::getInt32Ty(getLLVMContext()), 2);
18196+
break;
18197+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18198+
ArgTy = llvm::FixedVectorType::get(
18199+
llvm::Type::getHalfTy(getLLVMContext()), 4);
18200+
break;
18201+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18202+
ArgTy = llvm::FixedVectorType::get(
18203+
llvm::Type::getInt16Ty(getLLVMContext()), 4);
18204+
break;
18205+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18206+
ArgTy = llvm::FixedVectorType::get(
18207+
llvm::Type::getHalfTy(getLLVMContext()), 8);
18208+
break;
18209+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
18210+
ArgTy = llvm::FixedVectorType::get(
18211+
llvm::Type::getInt16Ty(getLLVMContext()), 8);
18212+
break;
18213+
}
18214+
18215+
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18216+
llvm::Function *F =
18217+
CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
18218+
return Builder.CreateCall(F, {Addr});
18219+
}
1818118220
case AMDGPU::BI__builtin_amdgcn_read_exec:
1818218221
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
1818318222
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm \
2+
// RUN: -verify -S -o - %s
3+
4+
// REQUIRES: amdgpu-registered-target
5+
6+
typedef int v2i __attribute__((ext_vector_type(2)));
7+
typedef half v8h __attribute__((ext_vector_type(8)));
8+
typedef short v8s __attribute__((ext_vector_type(8)));
9+
10+
typedef half v4h __attribute__((ext_vector_type(4)));
11+
typedef short v4s __attribute__((ext_vector_type(4)));
12+
13+
14+
15+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
16+
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
17+
{
18+
v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
19+
v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
20+
v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
21+
22+
int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
23+
v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
24+
v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
25+
}
26+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm \
2+
// RUN: -verify -S -o - %s
3+
4+
// REQUIRES: amdgpu-registered-target
5+
6+
typedef half v4h __attribute__((ext_vector_type(4)));
7+
typedef short v4s __attribute__((ext_vector_type(4)));
8+
9+
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
10+
{
11+
int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
12+
v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
13+
v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
14+
}
15+
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm \
2+
// RUN: -verify -S -o - %s
3+
4+
// REQUIRES: amdgpu-registered-target
5+
6+
typedef int v2i __attribute__((ext_vector_type(2)));
7+
typedef half v8h __attribute__((ext_vector_type(8)));
8+
typedef short v8s __attribute__((ext_vector_type(8)));
9+
10+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
11+
{
12+
v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
13+
v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
14+
v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
15+
}
16+
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
4+
5+
typedef int v2i __attribute__((ext_vector_type(2)));
6+
typedef half v8h __attribute__((ext_vector_type(8)));
7+
typedef short v8s __attribute__((ext_vector_type(8)));
8+
9+
// Wave32
10+
11+
//
12+
// amdgcn_global_load_tr
13+
//
14+
15+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
16+
// CHECK-GFX1200-NEXT: entry:
17+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
18+
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
19+
//
20+
v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
21+
{
22+
return __builtin_amdgcn_global_load_tr_v2i32(inptr);
23+
}
24+
25+
//
26+
// amdgcn_global_load_tr
27+
//
28+
29+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
30+
// CHECK-GFX1200-NEXT: entry:
31+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
32+
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
33+
//
34+
v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
35+
{
36+
return __builtin_amdgcn_global_load_tr_v8i16(inptr);
37+
}
38+
39+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
40+
// CHECK-GFX1200-NEXT: entry:
41+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
42+
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
43+
//
44+
v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
45+
{
46+
return __builtin_amdgcn_global_load_tr_v8f16(inptr);
47+
}
48+
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
4+
5+
typedef half v4h __attribute__((ext_vector_type(4)));
6+
typedef short v4s __attribute__((ext_vector_type(4)));
7+
8+
// Wave64
9+
10+
//
11+
// amdgcn_global_load_tr
12+
//
13+
14+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
15+
// CHECK-GFX1200-NEXT: entry:
16+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
17+
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
18+
//
19+
int test_amdgcn_global_load_tr_i32(global int* inptr)
20+
{
21+
return __builtin_amdgcn_global_load_tr_i32(inptr);
22+
}
23+
24+
//
25+
// amdgcn_global_load_tr
26+
//
27+
28+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
29+
// CHECK-GFX1200-NEXT: entry:
30+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
31+
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
32+
//
33+
v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
34+
{
35+
return __builtin_amdgcn_global_load_tr_v4i16(inptr);
36+
}
37+
38+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
39+
// CHECK-GFX1200-NEXT: entry:
40+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
41+
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
42+
//
43+
v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
44+
{
45+
return __builtin_amdgcn_global_load_tr_v4f16(inptr);
46+
}
47+

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2577,6 +2577,28 @@ def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
25772577

25782578
def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;
25792579

2580+
class AMDGPULoadTr<LLVMType ptr_ty>:
2581+
Intrinsic<
2582+
[llvm_any_ty],
2583+
[ptr_ty],
2584+
[IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
2585+
"",
2586+
[SDNPMemOperand]
2587+
>;
2588+
2589+
// Wave32
2590+
// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
2591+
// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
2592+
// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
2593+
// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
2594+
// Wave64
2595+
// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
2596+
// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
2597+
// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
2598+
// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128
2599+
2600+
def int_amdgcn_global_load_tr : AMDGPULoadTr<global_ptr_ty>;
2601+
25802602
//===----------------------------------------------------------------------===//
25812603
// Deep learning intrinsics.
25822604
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4864,6 +4864,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48644864
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
48654865
case Intrinsic::amdgcn_atomic_cond_sub_u32:
48664866
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
4867+
case Intrinsic::amdgcn_global_load_tr:
48674868
return getDefaultMappingAllVGPR(MI);
48684869
case Intrinsic::amdgcn_ds_ordered_add:
48694870
case Intrinsic::amdgcn_ds_ordered_swap:

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
544544
Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
545545
if (Res)
546546
break;
547+
548+
Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
549+
if (Res)
550+
break;
547551
}
548552
// Reinitialize Bytes
549553
Bytes = Bytes_.slice(0, MaxInstBytesNum);

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,17 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_d
992992

993993
} // End SubtargetPredicate = HasFlatScratchInsts
994994

995+
let SubtargetPredicate = isGFX12Plus in {
996+
let WaveSizePredicate = isWave32 in {
997+
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
998+
defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>;
999+
}
1000+
let WaveSizePredicate = isWave64 in {
1001+
defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
1002+
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
1003+
}
1004+
} // End SubtargetPredicate = isGFX12Plus
1005+
9951006
let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
9961007
defm GLOBAL_ATOMIC_FCMPSWAP :
9971008
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
@@ -1590,6 +1601,19 @@ let SubtargetPredicate = isGFX12Plus in {
15901601

15911602
let OtherPredicates = [isGFX12Plus] in {
15921603
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
1604+
1605+
let WaveSizePredicate = isWave32 in {
1606+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
1607+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
1608+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
1609+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
1610+
}
1611+
let WaveSizePredicate = isWave64 in {
1612+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
1613+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
1614+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
1615+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
1616+
}
15931617
}
15941618

15951619
let OtherPredicates = [isGFX10Plus] in {
@@ -2726,6 +2750,17 @@ defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_A
27262750
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
27272751
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
27282752
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;
2753+
2754+
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
2755+
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">;
2756+
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">;
2757+
}
2758+
2759+
let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in {
2760+
defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">;
2761+
defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">;
2762+
}
2763+
27292764
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;
27302765
defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059, "GLOBAL_ATOMIC_PK_ADD_F16", "global_atomic_pk_add_f16">;
27312766
defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a, "GLOBAL_ATOMIC_PK_ADD_BF16", "global_atomic_pk_add_bf16">;

0 commit comments

Comments
 (0)