Skip to content

[AMDGPU] Implement moveToVALU for S_CSELECT_B64 #70352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into the base branch from the contributor's branch
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4727,8 +4727,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIRegisterInfo *TRI = ST.getRegisterInfo();

Register Dst = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
const MachineOperand &Src0 = MI.getOperand(1);
const MachineOperand &Src1 = MI.getOperand(2);
const DebugLoc &DL = MI.getDebugLoc();
Register SrcCond = MI.getOperand(3).getReg();

Expand All @@ -4737,20 +4737,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

const TargetRegisterClass *Src0RC = Src0.isReg()
? MRI.getRegClass(Src0.getReg())
: &AMDGPU::VReg_64RegClass;
const TargetRegisterClass *Src1RC = Src1.isReg()
? MRI.getRegClass(Src1.getReg())
: &AMDGPU::VReg_64RegClass;

const TargetRegisterClass *Src0SubRC =
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *Src1SubRC =
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow! I didn't know it does exist. Seems useful.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is mostly copied from the handling of S_ADD_U64_PSEUDO and other 64-bit pseudos that get split.

MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
.addImm(0)
.addReg(Src0, 0, AMDGPU::sub0)
.addImm(0)
.addReg(Src1, 0, AMDGPU::sub0)
.addReg(SrcCondCopy);
.addImm(0)
.add(Src0Sub0)
.addImm(0)
.add(Src1Sub0)
.addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
.addImm(0)
.addReg(Src0, 0, AMDGPU::sub1)
.addImm(0)
.addReg(Src1, 0, AMDGPU::sub1)
.addReg(SrcCondCopy);
.addImm(0)
.add(Src0Sub1)
.addImm(0)
.add(Src1Sub1)
.addReg(SrcCondCopy);

BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
Expand Down
59 changes: 33 additions & 26 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7223,35 +7223,35 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineOperand &Src1 = Inst.getOperand(2);
MachineOperand &Cond = Inst.getOperand(3);

Register SCCSource = Cond.getReg();
bool IsSCC = (SCCSource == AMDGPU::SCC);
Register CondReg = Cond.getReg();
bool IsSCC = (CondReg == AMDGPU::SCC);

// If this is a trivial select where the condition is effectively not SCC
// (SCCSource is a source of copy to SCC), then the select is semantically
// equivalent to copying SCCSource. Hence, there is no need to create
// (CondReg is a source of copy to SCC), then the select is semantically
// equivalent to copying CondReg. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
(Src1.getImm() == 0)) {
MRI.replaceRegWith(Dest.getReg(), SCCSource);
MRI.replaceRegWith(Dest.getReg(), CondReg);
return;
}

const TargetRegisterClass *TC =
RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

Register CopySCC = MRI.createVirtualRegister(TC);

Register NewCondReg = CondReg;
if (IsSCC) {
const TargetRegisterClass *TC =
RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
NewCondReg = MRI.createVirtualRegister(TC);

// Now look for the closest SCC def if it is a copy
// replacing the SCCSource with the COPY source register
// replacing the CondReg with the COPY source register
bool CopyFound = false;
for (MachineInstr &CandI :
make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
Inst.getParent()->rend())) {
if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-1) {
if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
.addReg(CandI.getOperand(1).getReg());
CopyFound = true;
}
Expand All @@ -7266,24 +7266,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
}
}

Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

auto UpdatedInst =
BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
.addImm(0)
.add(Src1) // False
.addImm(0)
.add(Src0) // True
.addReg(IsSCC ? CopySCC : SCCSource);

MRI.replaceRegWith(Dest.getReg(), ResultReg);
legalizeOperands(*UpdatedInst, MDT);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
Register NewDestReg = MRI.createVirtualRegister(
RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
MachineInstr *NewInst;
if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
.addImm(0)
.add(Src1) // False
.addImm(0)
.add(Src0) // True
.addReg(NewCondReg);
} else {
NewInst =
BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
.add(Src1) // False
.add(Src0) // True
.addReg(NewCondReg);
}
MRI.replaceRegWith(Dest.getReg(), NewDestReg);
legalizeOperands(*NewInst, MDT);
addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
}

void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
Expand Down
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s

define amdgpu_cs <2 x i32> @f() {
; CHECK-LABEL: f:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s2, s0
; CHECK-NEXT: s_mov_b32 s3, s0
; CHECK-NEXT: s_mov_b32 s4, s0
; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; CHECK-NEXT: s_mov_b32 s5, s0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; CHECK-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; CHECK-NEXT: ; return to shader part epilog
bb:
; Regression test for S_CSELECT_B64 handling in moveToVALU: the buffer load
; makes one lane of the <3 x i64> compare operand VALU-produced, and the
; i1 -> i64 zext of that compare is select-like, so lowering must end up as a
; v_cndmask on the VALU (see the v_cmp_ne_u64 / v_cndmask_b32 checks above).
; NOTE(review): exact failure mode before the fix (crash vs. miscompile) is
; not visible here — confirm against the PR description if it matters.
%i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
%i1 = bitcast <2 x i32> %i to i64
; Only element 2 of the vector is non-constant; elements 0 and 1 fold away.
%i2 = insertelement <3 x i64> zeroinitializer, i64 %i1, i64 2
%i3 = icmp ne <3 x i64> %i2, zeroinitializer
%i4 = zext <3 x i1> %i3 to <3 x i64>
%i5 = bitcast <3 x i64> %i4 to <6 x i32>
; Extract the two i32 halves of the zext'd element-2 result (indices 4,5).
%i6 = shufflevector <6 x i32> %i5, <6 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %i6, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
ret <2 x i32> %i6
}

declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32 immarg)
19 changes: 19 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,22 @@ body: |
%0:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
%1:sreg_32 = COPY %0.sub0
...

---
# GCN-LABEL: name: s_cselect_b64
# GCN: %0:vgpr_32 = IMPLICIT_DEF
# GCN: %1:vreg_64 = IMPLICIT_DEF
# GCN: %2:sreg_32 = IMPLICIT_DEF
# GCN: %3:sreg_64 = IMPLICIT_DEF
# GCN: %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
# GCN: %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %5, implicit $exec
name: s_cselect_b64
# COPYs from VGPR defs give the S_CMP/S_CSELECT VALU-produced operands, so
# SIFixSGPRCopies must move the whole chain to the VALU: per the GCN check
# lines above, the S_CMP_EQ_U32 becomes V_CMP_EQ_U32_e64 and the 64-bit
# S_CSELECT_B64 becomes V_CNDMASK_B64_PSEUDO.
body: |
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vreg_64 = IMPLICIT_DEF
%2:sreg_32 = COPY %0
%3:sreg_64 = COPY %1
S_CMP_EQ_U32 %2, 0, implicit-def $scc
%4:sreg_64 = S_CSELECT_B64 %3, 0, implicit $scc
...