Skip to content

AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix #102130

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 69 additions & 95 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
MachineInstrBuilder SelectedMI;
MachineOperand &LHS = I.getOperand(2);
MachineOperand &RHS = I.getOperand(3);
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
Register Src0Reg =
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
Register Src1Reg =
Expand Down Expand Up @@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}

static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
}

static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
Register BitcastSrc;
if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
Reg = BitcastSrc;
return Reg;
}

static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
Register &Out) {
Register Trunc;
if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
return false;

Register LShlSrc;
if (mi_match(In, MRI,
m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
Out = LShlSrc;
Register Cst;
if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
Cst = stripCopy(Cst, MRI);
if (mi_match(Cst, MRI, m_SpecificICst(16))) {
Out = stripBitCast(LShlSrc, MRI);
return true;
}
}

MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
return false;

assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
LLT::fixed_vector(2, 16));

ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
assert(Mask.size() == 2);

if (Mask[0] == 1 && Mask[1] <= 1) {
Out = Shuffle->getOperand(0).getReg();
return true;
}

return false;
}

Expand Down Expand Up @@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {

}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
bool IsCanonicalizing,
bool AllowAbs, bool OpSel) const {
Register Src = Root.getReg();
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);

Expand Down Expand Up @@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

return {{
[=](MachineInstrBuilder &MIB) {
Expand All @@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false);

Expand All @@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

return {{
[=](MachineInstrBuilder &MIB) {
Expand All @@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
std::tie(Src, Mods) =
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);

return {{
[=](MachineInstrBuilder &MIB) {
Expand All @@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
std::tie(Src, Mods) =
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
/*AllowAbs=*/false);

return {{
[=](MachineInstrBuilder &MIB) {
Expand Down Expand Up @@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

// FIXME: Handle op_sel
return {{
Expand All @@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/false);
Expand All @@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/true);
Expand Down Expand Up @@ -5229,97 +5262,41 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

// Variant of stripBitCast that returns the instruction instead of a
// MachineOperand.
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
if (MI->getOpcode() == AMDGPU::G_BITCAST)
return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
return MI;
}

// Figure out if this is really an extract of the high 16-bits of a dword,
// returns nullptr if it isn't.
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
MachineRegisterInfo &MRI) {
Inst = stripBitCast(Inst, MRI);

if (Inst->getOpcode() != AMDGPU::G_TRUNC)
return nullptr;

MachineInstr *TruncOp =
getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
TruncOp = stripBitCast(TruncOp, MRI);

// G_LSHR x, (G_CONSTANT i32 16)
if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
auto SrlAmount = getIConstantVRegValWithLookThrough(
TruncOp->getOperand(2).getReg(), MRI);
if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
MachineInstr *SrlOp =
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
return stripBitCast(SrlOp, MRI);
}
}

// G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
// 1, 0 swaps the low/high 16 bits.
// 1, 1 sets the high 16 bits to be the same as the low 16.
// in any case, it selects the high elts.
if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
LLT::fixed_vector(2, 16));

ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
assert(Mask.size() == 2);

if (Mask[0] == 1 && Mask[1] <= 1) {
MachineInstr *LHS =
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
return stripBitCast(LHS, MRI);
}
}

return nullptr;
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
Matched = false;

Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);

MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI->getOpcode() == AMDGPU::G_FPEXT) {
MachineOperand *MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
assert(MRI->getType(Src) == LLT::scalar(16));

// See through bitcasts.
// FIXME: Would be nice to use stripBitCast here.
if (MI->getOpcode() == AMDGPU::G_BITCAST) {
MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
}
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
// since this could violate constant bus restriction.
Register PeekSrc = stripCopy(Src, *MRI);

const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
MI = getDefIgnoringCopies(Src, *MRI);
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);

if ((ModsTmp & SISrcMods::NEG) != 0)
if ((ModsTmp & SISrcMods::NEG) != 0) {
Mods ^= SISrcMods::NEG;
Src = PeekSrc;
}

if ((ModsTmp & SISrcMods::ABS) != 0)
if ((ModsTmp & SISrcMods::ABS) != 0) {
Mods |= SISrcMods::ABS;
Src = PeekSrc;
}
}
};

Expand All @@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,

Mods |= SISrcMods::OP_SEL_1;

if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
Src = PeekSrc;
Mods |= SISrcMods::OP_SEL_0;
MI = ExtractHiEltMI;
MO = &MI->getOperand(0);
Src = MO->getReg();

CheckAbsNeg();
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;

std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_matching_source_from_unmerge:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16
Expand Down
Loading