-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix #102130
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. Join @petar-avramovic and the rest of your teammates on |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) ChangesisExtractHiElt should return new source register instead of returning Full diff: https://github.com/llvm/llvm-project/pull/102130.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 73f3921b2ff4c..f78699f88de56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
MachineInstrBuilder SelectedMI;
MachineOperand &LHS = I.getOperand(2);
MachineOperand &RHS = I.getOperand(3);
- auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
- auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
+ auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
+ auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
Register Src0Reg =
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
Register Src1Reg =
@@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}
+static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
+ return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
+}
+
+static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
+ Register BitcastSrc;
+ if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
+ Reg = BitcastSrc;
+ return Reg;
+}
+
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
Register &Out) {
+ Register Trunc;
+ if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
+ return false;
+
Register LShlSrc;
- if (mi_match(In, MRI,
- m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
- Out = LShlSrc;
+ Register Cst;
+ if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
+ Cst = stripCopy(Cst, MRI);
+ if (mi_match(Cst, MRI, m_SpecificICst(16))) {
+ Out = stripBitCast(LShlSrc, MRI);
+ return true;
+ }
+ }
+
+ MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
+ if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
+ return false;
+
+ assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
+ LLT::fixed_vector(2, 16));
+
+ ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
+ assert(Mask.size() == 2);
+
+ if (Mask[0] == 1 && Mask[1] <= 1) {
+ Out = Shuffle->getOperand(0).getReg();
return true;
}
+
return false;
}
@@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
- bool IsCanonicalizing,
- bool AllowAbs, bool OpSel) const {
- Register Src = Root.getReg();
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
@@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
@@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
+ std::tie(Src, Mods) =
+ selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
- /*AllowAbs=*/false);
+ std::tie(Src, Mods) =
+ selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
// FIXME: Handle op_sel
return {{
@@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/false);
@@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/true);
@@ -5229,59 +5262,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
-// Variant of stripBitCast that returns the instruction instead of a
-// MachineOperand.
-static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
- if (MI->getOpcode() == AMDGPU::G_BITCAST)
- return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
- return MI;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword,
-// returns nullptr if it isn't.
-static MachineInstr *isExtractHiElt(MachineInstr *Inst,
- MachineRegisterInfo &MRI) {
- Inst = stripBitCast(Inst, MRI);
-
- if (Inst->getOpcode() != AMDGPU::G_TRUNC)
- return nullptr;
-
- MachineInstr *TruncOp =
- getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
- TruncOp = stripBitCast(TruncOp, MRI);
-
- // G_LSHR x, (G_CONSTANT i32 16)
- if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
- auto SrlAmount = getIConstantVRegValWithLookThrough(
- TruncOp->getOperand(2).getReg(), MRI);
- if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
- MachineInstr *SrlOp =
- getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
- return stripBitCast(SrlOp, MRI);
- }
- }
-
- // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
- // 1, 0 swaps the low/high 16 bits.
- // 1, 1 sets the high 16 bits to be the same as the low 16.
- // in any case, it selects the high elts.
- if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
- assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
- LLT::fixed_vector(2, 16));
-
- ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
- assert(Mask.size() == 2);
-
- if (Mask[0] == 1 && Mask[1] <= 1) {
- MachineInstr *LHS =
- getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
- return stripBitCast(LHS, MRI);
- }
- }
-
- return nullptr;
-}
-
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
@@ -5289,37 +5269,34 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
-
- MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
- if (MI->getOpcode() == AMDGPU::G_FPEXT) {
- MachineOperand *MO = &MI->getOperand(1);
- Src = MO->getReg();
- MI = getDefIgnoringCopies(Src, *MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
assert(MRI->getType(Src) == LLT::scalar(16));
- // See through bitcasts.
- // FIXME: Would be nice to use stripBitCast here.
- if (MI->getOpcode() == AMDGPU::G_BITCAST) {
- MO = &MI->getOperand(1);
- Src = MO->getReg();
- MI = getDefIgnoringCopies(Src, *MRI);
- }
+ // Only change Src if src modifier could be gained. In such cases new Src
+ // could be sgpr but this does not violate constant bus restriction for
+ // instruction that is being selected.
+ // Note: Src is not changed when there is only a simple sgpr to vgpr copy
+ // since this could violate constant bus restriction.
+ Register PeekSrc = stripCopy(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
- std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
- MI = getDefIgnoringCopies(Src, *MRI);
+ std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
- if ((ModsTmp & SISrcMods::NEG) != 0)
+ if ((ModsTmp & SISrcMods::NEG) != 0) {
Mods ^= SISrcMods::NEG;
+ Src = PeekSrc;
+ }
- if ((ModsTmp & SISrcMods::ABS) != 0)
+ if ((ModsTmp & SISrcMods::ABS) != 0) {
Mods |= SISrcMods::ABS;
+ Src = PeekSrc;
+ }
}
};
@@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
- if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
+ if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
+ Src = PeekSrc;
Mods |= SISrcMods::OP_SEL_0;
- MI = ExtractHiEltMI;
- MO = &MI->getOperand(0);
- Src = MO->getReg();
-
CheckAbsNeg();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 7fff7d2685e7f..69806b240cf2b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -150,7 +150,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
- std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
+ std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
index e910c2eca2ced..b2b433167fe4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
@@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_matching_source_from_unmerge:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16
|
isExtractHiElt should return new source register instead of returning instruction that defines it. Src = MI.getOperand(0).getReg() is not correct when MI(for example G_UNMERGE_VALUES) defines multiple registers. Refactor existing code to work with source registers only.
b6c03d1
to
3ff5561
Compare
Merge activity
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/3043 Here is the relevant piece of the build log for the reference:
|
…2130) isExtractHiElt should return new source register instead of returning instruction that defines it. Src = MI.getOperand(0).getReg() is not correct when MI(for example G_UNMERGE_VALUES) defines multiple registers. Refactor existing code to work with source registers only. (cherry picked from commit 269cefb) Change-Id: I2dcc950ff7c5d68c1d6e5ee8e71c39a100f549b8
isExtractHiElt should return new source register instead of returning
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI(for example G_UNMERGE_VALUES) defines multiple registers.
Refactor existing code to work with source registers only.