Skip to content

[AMDGPU] Add global_load_tr for GFX12 #77772

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,13 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")

TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")

TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")

#undef BUILTIN
#undef TARGET_BUILTIN
39 changes: 39 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18178,6 +18178,45 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
}
case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {

llvm::Type *ArgTy;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt32Ty(getLLVMContext()), 2);
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getHalfTy(getLLVMContext()), 4);
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 4);
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getHalfTy(getLLVMContext()), 8);
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 8);
break;
}

llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
llvm::Function *F =
CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm \
// RUN: -verify -S -o - %s

// REQUIRES: amdgpu-registered-target

typedef int v2i __attribute__((ext_vector_type(2)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));

typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));



void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
{
v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}

int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm \
// RUN: -verify -S -o - %s

// REQUIRES: amdgpu-registered-target

typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));

void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
{
int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm \
// RUN: -verify -S -o - %s

// REQUIRES: amdgpu-registered-target

typedef int v2i __attribute__((ext_vector_type(2)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));

void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
{
v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
}

48 changes: 48 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200

typedef int v2i __attribute__((ext_vector_type(2)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));

// Wave32

//
// amdgcn_global_load_tr
//

// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
{
return __builtin_amdgcn_global_load_tr_v2i32(inptr);
}

//
// amdgcn_global_load_tr
//

// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
//
v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
{
return __builtin_amdgcn_global_load_tr_v8i16(inptr);
}

// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
//
v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
{
return __builtin_amdgcn_global_load_tr_v8f16(inptr);
}

47 changes: 47 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200

typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));

// Wave64

//
// amdgcn_global_load_tr
//

// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
//
int test_amdgcn_global_load_tr_i32(global int* inptr)
{
return __builtin_amdgcn_global_load_tr_i32(inptr);
}

//
// amdgcn_global_load_tr
//

// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
//
v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
{
return __builtin_amdgcn_global_load_tr_v4i16(inptr);
}

// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
// CHECK-GFX1200-NEXT: entry:
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
//
v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
{
return __builtin_amdgcn_global_load_tr_v4f16(inptr);
}

22 changes: 22 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2532,6 +2532,28 @@ def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;

class AMDGPULoadTr<LLVMType ptr_ty>:
Intrinsic<
[llvm_any_ty],
[ptr_ty],
[IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
"",
[SDNPMemOperand]
>;

// Wave32
// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128

def int_amdgcn_global_load_tr : AMDGPULoadTr<global_ptr_ty>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4858,6 +4858,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
if (Res)
break;

Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
if (Res)
break;
}
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
Expand Down
35 changes: 35 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,17 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_d

} // End SubtargetPredicate = HasFlatScratchInsts

let SubtargetPredicate = isGFX12Plus in {
let WaveSizePredicate = isWave32 in {
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>;
}
let WaveSizePredicate = isWave64 in {
defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
}
} // End SubtargetPredicate = isGFX12Plus

let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
Expand Down Expand Up @@ -1590,6 +1601,19 @@ let SubtargetPredicate = isGFX12Plus in {

let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;

let WaveSizePredicate = isWave32 in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
}
let WaveSizePredicate = isWave64 in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
}
}

let OtherPredicates = [isGFX10Plus] in {
Expand Down Expand Up @@ -2724,6 +2748,17 @@ defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_A
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;

let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">;
}

let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in {
defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">;
defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">;
}

defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;

defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>;
Expand Down
Loading