[AMDGPU][True16][CodeGen] readfirstlane for vgpr16 copy to sgpr32 #137848

Closed
20 changes: 16 additions & 4 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1086,10 +1086,22 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
           TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
       size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
       if (SrcSize == 16) {
-        // HACK to handle possible 16bit VGPR source
-        auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
-                           TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
-        MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+        assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+               "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+               "we have 16-bit VGPRs");
+        assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+               MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+               MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+        // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+        MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+        Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        const DebugLoc &DL = MI->getDebugLoc();
+        BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32)
+            .addImm(0)
+            .addReg(SrcReg, 0)
+            .addImm(AMDGPU::lo16);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+            .addReg(VReg32);
       } else if (SrcSize == 32) {
         auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
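In MIR terms, the pass no longer feeds a vgpr_16 operand directly to V_READFIRSTLANE_B32: it widens the 16-bit source into a 32-bit VGPR with SUBREG_TO_REG and constrains the destination to SReg_32_XM0 before issuing the 32-bit readfirstlane. A minimal before/after sketch of the rewrite (virtual register numbers and the llc invocation are illustrative, not copied from the test; mirror the actual RUN line in fix-sgpr-copies-f16-true16.mir):

# Assumed standalone invocation, for illustration only:
#   llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 \
#       -run-pass=si-fix-sgpr-copies -o - input.mir

# Before: a true16 VALU result is copied straight into a 32-bit SGPR.
%8:vgpr_16 = V_CVT_F16_F32_t16_e64 0, %7:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
%11:sreg_32 = COPY %8:vgpr_16

# After lowerVGPR2SGPRCopies: widen the source to 32 bits, then read the
# first lane; the destination class is tightened to sreg_32_xm0.
%20:vgpr_32 = SUBREG_TO_REG 0, %8:vgpr_16, %subreg.lo16
%11:sreg_32_xm0 = V_READFIRSTLANE_B32 %20:vgpr_32, implicit $exec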
9 changes: 1 addition & 8 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1472,16 +1472,9 @@ def : GCNPat <
 
 } // End OtherPredicates = [isGFX8Plus, p]
 
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
-  (i32 (DivergentUnaryFrag<anyext> i16:$src)),
-  (COPY $src)
->;
-} // End True16Predicate = UseFakeTrue16Insts
-
 let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat<
-  (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+  (i32 (UniformUnaryFrag<anyext> i16:$src)),
   (COPY $src)
 >;
 
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -53,3 +53,66 @@ body: |
%3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...

---
name: vgpr16_to_spgr32
body: |
; GCN-LABEL: name: vgpr16_to_spgr32
; GCN: bb.0.entry:
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16
; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[SUBREG_TO_REG]], implicit $exec
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]]
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]]
; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_6]]
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)

%5:sreg_32 = IMPLICIT_DEF
%6:vgpr_32 = COPY %5:sreg_32
%4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
%7:sgpr_32 = COPY %4.sub0:vreg_64
%8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
%9:sreg_32 = S_MOV_B32 65535
%11:sreg_32 = COPY %8:vgpr_16
%10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc
%12:sreg_32 = S_MOV_B32 5
%13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32
%14:sreg_32 = S_MOV_B32 2
S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit $scc
S_BRANCH %bb.1
bb.1:
%17:sreg_32 = S_MOV_B32 1
%18:sreg_32 = S_MOV_B32 killed %17:sreg_32
$sgpr0 = COPY %18:sreg_32
SI_RETURN_TO_EPILOG $sgpr0
bb.2:
%15:sreg_32 = S_MOV_B32 2
%16:sreg_32 = S_MOV_B32 killed %15:sreg_32
$sgpr0 = COPY %16:sreg_32
SI_RETURN_TO_EPILOG $sgpr0
...
41 changes: 41 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -0,0 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s

; expect readfirstlane to pick the 32bit register
define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) {
; CHECK-LABEL: vgpr16_copyto_sgpr:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_and_b32 s0, 0xffff, s0
; CHECK-NEXT: s_mul_i32 s0, s0, 5
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_lg_u32 s0, 2
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %a1
; CHECK-NEXT: s_mov_b32 s0, 1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2: ; %a2
; CHECK-NEXT: s_mov_b32 s0, 2
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_3:
entry:
%1 = load <4 x float>, ptr addrspace(3) poison, align 4
%2 = extractelement <4 x float> %1, i32 0
%3 = fptrunc float %2 to half
%4 = bitcast half %3 to i16
%5 = zext i16 %4 to i32
%6 = add i32 %5, 1
%7 = mul i32 %6, 5
%8 = icmp eq i32 %7, 7
br i1 %8, label %a1, label %a2

a1:
ret i32 1

a2:
ret i32 2
}