Skip to content

Commit 1a70de0

Browse files
committed
[AMDGPU][GlobalISel] Align selectVOP3PMadMixModsImpl with the SelectionDAG counterpart (llvm#110168)
The current `selectVOP3PMadMixModsImpl` can produce `V_MAD_MIX_F32` instructions that violate the constant bus restriction, while its `SelectionDAG` counterpart does not. The culprit is the copy stripping: the `SelectionDAG` version only strips bitcasts. This PR simply aligns the two versions. (cherry picked from commit 48ac846) Change-Id: I33ca6f072d443b9571675d7bee864d7fb3c4e1cc
1 parent 8392149 commit 1a70de0

File tree

3 files changed

+25
-22
lines changed

3 files changed

+25
-22
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5381,26 +5381,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
53815381
// Only change Src if src modifier could be gained. In such cases new Src
53825382
// could be sgpr but this does not violate constant bus restriction for
53835383
// instruction that is being selected.
5384-
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
5385-
// since this could violate constant bus restriction.
5386-
Register PeekSrc = stripCopy(Src, *MRI);
5384+
Src = stripBitCast(Src, *MRI);
53875385

53885386
const auto CheckAbsNeg = [&]() {
53895387
// Be careful about folding modifiers if we already have an abs. fneg is
53905388
// applied last, so we don't want to apply an earlier fneg.
53915389
if ((Mods & SISrcMods::ABS) == 0) {
53925390
unsigned ModsTmp;
5393-
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
5391+
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
53945392

5395-
if ((ModsTmp & SISrcMods::NEG) != 0) {
5393+
if ((ModsTmp & SISrcMods::NEG) != 0)
53965394
Mods ^= SISrcMods::NEG;
5397-
Src = PeekSrc;
5398-
}
53995395

5400-
if ((ModsTmp & SISrcMods::ABS) != 0) {
5396+
if ((ModsTmp & SISrcMods::ABS) != 0)
54015397
Mods |= SISrcMods::ABS;
5402-
Src = PeekSrc;
5403-
}
54045398
}
54055399
};
54065400

@@ -5413,8 +5407,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
54135407

54145408
Mods |= SISrcMods::OP_SEL_1;
54155409

5416-
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
5417-
Src = PeekSrc;
5410+
if (isExtractHiElt(*MRI, Src, Src)) {
54185411
Mods |= SISrcMods::OP_SEL_0;
54195412
CheckAbsNeg();
54205413
}

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
7373
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8
7474
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
7575
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
76+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16
77+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16
78+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16
79+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16
7680
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
77-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
81+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
7882
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
79-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
83+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
8084
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
8185
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
8286
.entry:
@@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
117121
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
118122
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
119123
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11
124+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16
125+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16
126+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16
127+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16
128+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16
129+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16
120130
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
121-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
131+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
122132
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
123-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
133+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
124134
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
125-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
135+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
126136
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
127137
.entry:
128138
%a = fmul fast <6 x half> %x, %y

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
25552555
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
25562556
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
25572557
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
2558-
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2559-
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
2560-
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
2558+
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
2559+
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
2560+
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
25612561
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
25622562
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
25632563
; GFX9-FLUSH-NEXT: ; return to shader part epilog
@@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
25712571
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
25722572
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
25732573
; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2574-
; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2574+
; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
25752575
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
25762576
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
25772577
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
25882588
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
25892589
; GFX11-NEXT: s_waitcnt_depctr 0xfff
25902590
; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2591-
; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2591+
; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
25922592
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
25932593
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
25942594
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1

0 commit comments

Comments
 (0)