Skip to content

AMDGPU: Add support for load transpose instructions for gfx950 #117378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")

// ds_read_tr* transpose-read builtins, gated on the "gfx950-insts" feature.
// Each takes a pointer in address space 3 and returns a vector of the pointee
// type (2 x int, 3 x int or 4 x short).
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")

//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
Expand Down
19 changes: 17 additions & 2 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19697,8 +19697,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {

case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
Expand All @@ -19713,6 +19716,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
IID = Intrinsic::amdgcn_ds_read_tr8_b64;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
break;
}
llvm::Type *LoadTy = ConvertType(E->getType());
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
Expand Down
50 changes: 50 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s

// Vector types used as the pointee and return types of the ds_read_tr
// builtins exercised in this file.
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v3i __attribute__((ext_vector_type(3)));
typedef short v4s __attribute__((ext_vector_type(4)));

// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_ds_read_b64_tr_b4_v2i32(local v2i* inptr)
{
// Exercises the tr4_b64 builtin on a local-address-space pointer; the
// assertions above expect a call to llvm.amdgcn.ds.read.tr4.b64.v2i32.
return __builtin_amdgcn_ds_read_tr4_b64_v2i32(inptr);
}

// GFX950-LABEL: define dso_local <3 x i32> @test_amdgcn_ds_read_b96_tr_b6_v3i32(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <3 x i32> [[TMP0]]
//
v3i test_amdgcn_ds_read_b96_tr_b6_v3i32(local v3i* inptr)
{
// Exercises the tr6_b96 builtin on a local-address-space pointer; the
// assertions above expect a call to llvm.amdgcn.ds.read.tr6.b96.v3i32.
return __builtin_amdgcn_ds_read_tr6_b96_v3i32(inptr);
}

// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b8_v2i32(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_ds_read_b64_tr_b8_v2i32(local v2i* inptr)
{
// Exercises the tr8_b64 builtin on a local-address-space pointer; the
// assertions above expect a call to llvm.amdgcn.ds.read.tr8.b64.v2i32.
return __builtin_amdgcn_ds_read_tr8_b64_v2i32(inptr);
}

// GFX950-LABEL: define dso_local <4 x i16> @test_amdgcn_ds_read_b64_tr_b16_v2i16(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <4 x i16> [[TMP0]]
//
v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
{
// Exercises the tr16_b64 builtin on a local-address-space pointer; the
// assertions above expect a call to llvm.amdgcn.ds.read.tr16.b64.v4i16.
// NOTE(review): the "_v2i16" suffix in this test's name does not match the
// v4s (4 x short) type it actually uses; consider renaming it to "_v4i16"
// together with its label assertion.
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
}
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2726,6 +2726,10 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:

def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
// ds_read_tr* transpose reads: the same AMDGPULoadIntrinsic shape as the
// global_load_tr intrinsics, but instantiated with a local pointer type.
def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;

// i32 @llvm.amdgcn.wave.id()
def int_amdgcn_wave_id :
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4967,6 +4967,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
case Intrinsic::amdgcn_ds_read_tr16_b64:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,11 @@ def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;

// Register the ds_read_tr* transpose-read intrinsics as sources of divergence.
def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr8_b64>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr16_b64>;

// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
// uniform result, so the returned struct is collectively divergent.
Expand Down
33 changes: 33 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,12 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
}
}

// Wraps DS_1A_RET in a multiclass whose single, unsuffixed def has
// has_m0_read forced to 0.
multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
  let has_m0_read = 0 in
  def "" : DS_1A_RET<opName, rc>;
}

class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
DS_1A_RET<opName, rc, 1>;

Expand Down Expand Up @@ -744,6 +750,13 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus

// ds_read_*_tr_* pseudos. They are predicated on wave64 and HasGFX950Insts,
// are explicitly marked as not storing, and use the NoM0 multiclass so
// has_m0_read is 0.
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
}

//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1179,6 +1192,18 @@ def : GCNPat <
sub0)
>;

// Selection pattern for a transpose-read intrinsic: matches `node` applied to
// a DS1Addr1Offset address producing a value of type `vt`, and emits `inst`
// with the matched pointer and offset plus a constant 0 for the trailing i1
// operand (presumably the gds bit -- confirm against DS_1A_RET).
class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPat <
(vt (node (DS1Addr1Offset i32:$ptr, i32:$offset))),
(inst $ptr, Offset:$offset, (i1 0))
>;

// Map each ds_read_tr* intrinsic to its instruction for the matching result
// type; only available when HasGFX950Insts holds.
let SubtargetPredicate = HasGFX950Insts in {
def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
}

//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1748,3 +1773,11 @@ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;

//===----------------------------------------------------------------------===//
// GFX950.
//===----------------------------------------------------------------------===//
// Real (_vi) definitions for the transpose-read pseudos, using consecutive
// values 0x0e0-0x0e3 as the first DS_Real_vi argument (presumably the opcode
// -- confirm against DS_Real_vi).
def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1382,7 +1382,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128: {
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
case Intrinsic::amdgcn_ds_read_tr16_b64: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Expand Down Expand Up @@ -1477,6 +1481,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
case Intrinsic::amdgcn_ds_read_tr16_b64:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
Expand Down
44 changes: 44 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,50 @@ bb:
ret void
}

declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
; Calls ds.read.tr4.b64 on an address derived from a kernel argument and
; stores the result; the assertion above expects the call to be divergent.
define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
ret void
}

declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
; Calls ds.read.tr6.b96 on an address derived from a kernel argument and
; stores the result; the assertion above expects the call to be divergent.
define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16
ret void
}

declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
; Calls ds.read.tr8.b64 on an address derived from a kernel argument and
; stores the result; the assertion above expects the call to be divergent.
define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
ret void
}

declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
; Calls ds.read.tr16.b64 on an address derived from a kernel argument and
; stores the result; the assertion above expects the call to be divergent.
; The GEP index type and the store alignment follow the sibling 64-bit tests
; (i32 index, 8-byte alignment for the 8-byte <4 x i16> value).
define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
  store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8
  ret void
}

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)

Expand Down
108 changes: 108 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s

; Declarations of the transpose-read intrinsics called by the tests in this
; file. The tr6 variant is the 96-bit (b96) read returning <3 x i32>, matching
; its call site.
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3))
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))

; Reads <2 x i32> through ds.read.tr4.b64 at %addr + 4 x i64 and stores it to
; %use. Both selectors are expected to emit ds_read_b64_tr_b4 with the GEP
; folded into offset:32.
define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b4:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}

; Reads <3 x i32> through ds.read.tr6.b96 at %addr + 4 x i64 and stores it to
; %use. Both selectors are expected to emit ds_read_b96_tr_b6 with the GEP
; folded into offset:32.
define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
store <3 x i32> %val, ptr addrspace(1) %use
ret void
}

; Reads <2 x i32> through ds.read.tr8.b64 at %addr + 4 x i64 and stores it to
; %use. Both selectors are expected to emit ds_read_b64_tr_b8 with the GEP
; folded into offset:32.
define amdgpu_ps void @ds_read_b64_tr_b8(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b8:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b8:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}

; Reads <4 x i16> through ds.read.tr16.b64 at %addr + 4 x i64 and stores it to
; %use. Both selectors are expected to emit ds_read_b64_tr_b16 with the GEP
; folded into offset:32.
define amdgpu_ps void @ds_read_b64_tr_b16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep)
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}
Loading
Loading