Skip to content

Commit 269cefb

Browse files
AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (llvm#102130)
isExtractHiElt should return the new source register instead of the instruction that defines it. Using Src = MI.getOperand(0).getReg() is not correct when MI (for example, G_UNMERGE_VALUES) defines multiple registers, because operand 0 is only the first of the registers it defines and may not be the one actually used. Refactor the existing code to work with source registers only.
1 parent e842998 commit 269cefb

File tree

3 files changed

+74
-100
lines changed

3 files changed

+74
-100
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 69 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
13721372
MachineInstrBuilder SelectedMI;
13731373
MachineOperand &LHS = I.getOperand(2);
13741374
MachineOperand &RHS = I.getOperand(3);
1375-
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1376-
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1375+
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1376+
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
13771377
Register Src0Reg =
13781378
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
13791379
Register Src1Reg =
@@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
24672467
return false;
24682468
}
24692469

2470+
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2471+
return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2472+
}
2473+
2474+
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2475+
Register BitcastSrc;
2476+
if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2477+
Reg = BitcastSrc;
2478+
return Reg;
2479+
}
2480+
24702481
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
24712482
Register &Out) {
2483+
Register Trunc;
2484+
if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2485+
return false;
2486+
24722487
Register LShlSrc;
2473-
if (mi_match(In, MRI,
2474-
m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2475-
Out = LShlSrc;
2488+
Register Cst;
2489+
if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2490+
Cst = stripCopy(Cst, MRI);
2491+
if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2492+
Out = stripBitCast(LShlSrc, MRI);
2493+
return true;
2494+
}
2495+
}
2496+
2497+
MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2498+
if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2499+
return false;
2500+
2501+
assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2502+
LLT::fixed_vector(2, 16));
2503+
2504+
ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2505+
assert(Mask.size() == 2);
2506+
2507+
if (Mask[0] == 1 && Mask[1] <= 1) {
2508+
Out = Shuffle->getOperand(0).getReg();
24762509
return true;
24772510
}
2511+
24782512
return false;
24792513
}
24802514

@@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
35503584

35513585
}
35523586

3553-
std::pair<Register, unsigned>
3554-
AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3555-
bool IsCanonicalizing,
3556-
bool AllowAbs, bool OpSel) const {
3557-
Register Src = Root.getReg();
3587+
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
3588+
Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
35583589
unsigned Mods = 0;
35593590
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
35603591

@@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
36173648
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
36183649
Register Src;
36193650
unsigned Mods;
3620-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3651+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
36213652

36223653
return {{
36233654
[=](MachineInstrBuilder &MIB) {
@@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
36333664
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
36343665
Register Src;
36353666
unsigned Mods;
3636-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3667+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
36373668
/*IsCanonicalizing=*/true,
36383669
/*AllowAbs=*/false);
36393670

@@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
36603691
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
36613692
Register Src;
36623693
unsigned Mods;
3663-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3694+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
36643695

36653696
return {{
36663697
[=](MachineInstrBuilder &MIB) {
@@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
36753706
MachineOperand &Root) const {
36763707
Register Src;
36773708
unsigned Mods;
3678-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3709+
std::tie(Src, Mods) =
3710+
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
36793711

36803712
return {{
36813713
[=](MachineInstrBuilder &MIB) {
@@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
36893721
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
36903722
Register Src;
36913723
unsigned Mods;
3692-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3693-
/*AllowAbs=*/false);
3724+
std::tie(Src, Mods) =
3725+
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
3726+
/*AllowAbs=*/false);
36943727

36953728
return {{
36963729
[=](MachineInstrBuilder &MIB) {
@@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
40164049
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
40174050
Register Src;
40184051
unsigned Mods;
4019-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4052+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
40204053

40214054
// FIXME: Handle op_sel
40224055
return {{
@@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
40294062
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
40304063
Register Src;
40314064
unsigned Mods;
4032-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4065+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
40334066
/*IsCanonicalizing=*/true,
40344067
/*AllowAbs=*/false,
40354068
/*OpSel=*/false);
@@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
40474080
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
40484081
Register Src;
40494082
unsigned Mods;
4050-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4083+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
40514084
/*IsCanonicalizing=*/true,
40524085
/*AllowAbs=*/false,
40534086
/*OpSel=*/true);
@@ -5229,97 +5262,41 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
52295262
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
52305263
}
52315264

5232-
// Variant of stripBitCast that returns the instruction instead of a
5233-
// MachineOperand.
5234-
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5235-
if (MI->getOpcode() == AMDGPU::G_BITCAST)
5236-
return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5237-
return MI;
5238-
}
5239-
5240-
// Figure out if this is really an extract of the high 16-bits of a dword,
5241-
// returns nullptr if it isn't.
5242-
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5243-
MachineRegisterInfo &MRI) {
5244-
Inst = stripBitCast(Inst, MRI);
5245-
5246-
if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5247-
return nullptr;
5248-
5249-
MachineInstr *TruncOp =
5250-
getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5251-
TruncOp = stripBitCast(TruncOp, MRI);
5252-
5253-
// G_LSHR x, (G_CONSTANT i32 16)
5254-
if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5255-
auto SrlAmount = getIConstantVRegValWithLookThrough(
5256-
TruncOp->getOperand(2).getReg(), MRI);
5257-
if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5258-
MachineInstr *SrlOp =
5259-
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5260-
return stripBitCast(SrlOp, MRI);
5261-
}
5262-
}
5263-
5264-
// G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5265-
// 1, 0 swaps the low/high 16 bits.
5266-
// 1, 1 sets the high 16 bits to be the same as the low 16.
5267-
// in any case, it selects the high elts.
5268-
if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5269-
assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5270-
LLT::fixed_vector(2, 16));
5271-
5272-
ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5273-
assert(Mask.size() == 2);
5274-
5275-
if (Mask[0] == 1 && Mask[1] <= 1) {
5276-
MachineInstr *LHS =
5277-
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5278-
return stripBitCast(LHS, MRI);
5279-
}
5280-
}
5281-
5282-
return nullptr;
5283-
}
5284-
52855265
std::pair<Register, unsigned>
52865266
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
52875267
bool &Matched) const {
52885268
Matched = false;
52895269

52905270
Register Src;
52915271
unsigned Mods;
5292-
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5293-
5294-
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5295-
if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5296-
MachineOperand *MO = &MI->getOperand(1);
5297-
Src = MO->getReg();
5298-
MI = getDefIgnoringCopies(Src, *MRI);
5272+
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
52995273

5274+
if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
53005275
assert(MRI->getType(Src) == LLT::scalar(16));
53015276

5302-
// See through bitcasts.
5303-
// FIXME: Would be nice to use stripBitCast here.
5304-
if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5305-
MO = &MI->getOperand(1);
5306-
Src = MO->getReg();
5307-
MI = getDefIgnoringCopies(Src, *MRI);
5308-
}
5277+
// Only change Src if src modifier could be gained. In such cases new Src
5278+
// could be sgpr but this does not violate constant bus restriction for
5279+
// instruction that is being selected.
5280+
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
5281+
// since this could violate constant bus restriction.
5282+
Register PeekSrc = stripCopy(Src, *MRI);
53095283

53105284
const auto CheckAbsNeg = [&]() {
53115285
// Be careful about folding modifiers if we already have an abs. fneg is
53125286
// applied last, so we don't want to apply an earlier fneg.
53135287
if ((Mods & SISrcMods::ABS) == 0) {
53145288
unsigned ModsTmp;
5315-
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5316-
MI = getDefIgnoringCopies(Src, *MRI);
5289+
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
53175290

5318-
if ((ModsTmp & SISrcMods::NEG) != 0)
5291+
if ((ModsTmp & SISrcMods::NEG) != 0) {
53195292
Mods ^= SISrcMods::NEG;
5293+
Src = PeekSrc;
5294+
}
53205295

5321-
if ((ModsTmp & SISrcMods::ABS) != 0)
5296+
if ((ModsTmp & SISrcMods::ABS) != 0) {
53225297
Mods |= SISrcMods::ABS;
5298+
Src = PeekSrc;
5299+
}
53235300
}
53245301
};
53255302

@@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
53325309

53335310
Mods |= SISrcMods::OP_SEL_1;
53345311

5335-
if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5312+
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
5313+
Src = PeekSrc;
53365314
Mods |= SISrcMods::OP_SEL_0;
5337-
MI = ExtractHiEltMI;
5338-
MO = &MI->getOperand(0);
5339-
Src = MO->getReg();
5340-
53415315
CheckAbsNeg();
53425316
}
53435317

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
150150
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
151151
bool selectSBarrierLeave(MachineInstr &I) const;
152152

153-
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
153+
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
154154
bool IsCanonicalizing = true,
155155
bool AllowAbs = true,
156156
bool OpSel = false) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
446446
; GFX9-DENORM: ; %bb.0: ; %.entry
447447
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
448448
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
449-
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
449+
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
450450
; GFX9-DENORM-NEXT: ; return to shader part epilog
451451
;
452452
; GFX10-LABEL: test_matching_source_from_unmerge:
453453
; GFX10: ; %bb.0: ; %.entry
454454
; GFX10-NEXT: ds_read_b64 v[2:3], v0
455455
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
456-
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
456+
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
457457
; GFX10-NEXT: ; return to shader part epilog
458458
;
459459
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
460460
; GFX10-CONTRACT: ; %bb.0: ; %.entry
461461
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
462462
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
463-
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
463+
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
464464
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
465465
;
466466
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
467467
; GFX10-DENORM: ; %bb.0: ; %.entry
468468
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
469469
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
470-
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
470+
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
471471
; GFX10-DENORM-NEXT: ; return to shader part epilog
472472
.entry:
473473
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16

0 commit comments

Comments
 (0)