[AMDGPU][True16][CodeGen] readfirstlane for vgpr16 copy to sgpr32 #118037


Merged · 3 commits · May 5, 2025
23 changes: 19 additions & 4 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1086,10 +1086,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
Contributor: Are you reaching here in practice?

Contributor Author (@broxigarchen, Jan 13, 2025): Hi Matt. I think we probably never hit this before in the previous code, but with the true16 flow we should hit here.

Contributor: Not exactly; we don't want SIFixSGPRCopies to need to handle anything.

Contributor Author: Hi Matt. Can you elaborate a bit here? Do you think we should not see this 16-bit handling in this pass? Thanks!

Contributor: It does need to be handled here, but if you're seeing this in practice, something else is likely missing.

Contributor: This code fixed the conformance test I was looking at, and as you say it should be handled here, so I think it should be merged. Regarding an 'alternative fix', do you have any hypothesis as to what that would be? Is it a priority to find it?

Contributor: It depends what it is. Either way, that indicates we should have an end-to-end IR test for whatever this issue was as well.

Contributor Author: Updated this PR's description to answer this question.
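
For reference, the kind of copy that reaches this path in the true16 flow is a 16-bit VGPR result copied directly into a 32-bit SGPR, as in this excerpt from the MIR test added below:

%8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
%11:sreg_32 = COPY %8:vgpr_16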

-// HACK to handle possible 16bit VGPR source
-auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
-                   TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
-MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+       "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+       "we have 16-bit VGPRs");
+assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+       MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+       MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+// There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+const DebugLoc &DL = MI->getDebugLoc();
+Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
+    .addReg(SrcReg, 0, SubReg)
+    .addImm(AMDGPU::lo16)
+    .addReg(Undef)
+    .addImm(AMDGPU::hi16);
+BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+    .addReg(VReg32);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
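A rough before/after sketch of the rewrite this hunk performs, reconstructed from the MIR test below (register names are illustrative, not taken from the pass output):

; before: 16-bit VGPR source copied into a 32-bit SGPR, which has no direct lowering
%dst:sreg_32 = COPY %src:vgpr_16

; after: widen the source to 32 bits with an undefined hi16 half, then read the first active lane
%undef:vgpr_16 = IMPLICIT_DEF
%wide:vgpr_32 = REG_SEQUENCE %src, %subreg.lo16, %undef, %subreg.hi16
%dst:sreg_32_xm0 = V_READFIRSTLANE_B32 %wide, implicit $exec

The undefined high half is harmless because only the low 16 bits of the widened value carry the copied data; in the IR test below, the consumer masks the result with 0xffff.
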
9 changes: 1 addition & 8 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1472,16 +1472,9 @@ def : GCNPat <

} // End OtherPredicates = [isGFX8Plus, p]

-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
-  (i32 (DivergentUnaryFrag<anyext> i16:$src)),
-  (COPY $src)
->;
-} // End True16Predicate = UseFakeTrue16Insts
-
 let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat<
-  (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+  (i32 (UniformUnaryFrag<anyext> i16:$src)),
   (COPY $src)
 >;

64 changes: 64 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -53,3 +53,67 @@ body: |
%3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...

---
name: vgpr16_to_spgr32
body: |
; GCN-LABEL: name: vgpr16_to_spgr32
; GCN: bb.0.entry:
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]]
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]]
; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_6]]
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)

%5:sreg_32 = IMPLICIT_DEF
%6:vgpr_32 = COPY %5:sreg_32
%4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
%7:sgpr_32 = COPY %4.sub0:vreg_64
%8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
%9:sreg_32 = S_MOV_B32 65535
%11:sreg_32 = COPY %8:vgpr_16
%10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc
%12:sreg_32 = S_MOV_B32 5
%13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32
%14:sreg_32 = S_MOV_B32 2
S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit $scc
S_BRANCH %bb.1
bb.1:
%17:sreg_32 = S_MOV_B32 1
%18:sreg_32 = S_MOV_B32 killed %17:sreg_32
$sgpr0 = COPY %18:sreg_32
SI_RETURN_TO_EPILOG $sgpr0
bb.2:
%15:sreg_32 = S_MOV_B32 2
%16:sreg_32 = S_MOV_B32 killed %15:sreg_32
$sgpr0 = COPY %16:sreg_32
SI_RETURN_TO_EPILOG $sgpr0
...
44 changes: 44 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -0,0 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s

@lds = external local_unnamed_addr addrspace(3) global [4 x float], align 4

; expect readfirstlane to pick the 32bit register
define amdgpu_gs i32 @vgpr16_copyto_sgpr() {
; CHECK-LABEL: vgpr16_copyto_sgpr:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, lds@abs32@lo
; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_and_b32 s0, 0xffff, s0
; CHECK-NEXT: s_mul_i32 s0, s0, 5
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_lg_u32 s0, 2
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %a1
; CHECK-NEXT: s_mov_b32 s0, 1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2: ; %a2
; CHECK-NEXT: s_mov_b32 s0, 2
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_3:
entry:
%ptr = load <4 x float>, ptr addrspace(3) @lds, align 4
%f = extractelement <4 x float> %ptr, i32 0
%half = fptrunc float %f to half
%i16 = bitcast half %half to i16
%i32 = zext i16 %i16 to i32
%add = add i32 %i32, 1
%mul = mul i32 %add, 5
%icmp = icmp eq i32 %mul, 7
br i1 %icmp, label %a1, label %a2

a1:
ret i32 1

a2:
ret i32 2
}