Skip to content

Commit 57714ea

Browse files
committed
fix mad-mix pattern in sdag and gisel
1 parent 187b6b1 commit 57714ea

File tree

5 files changed

+125
-247
lines changed

5 files changed

+125
-247
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3651,6 +3651,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
36513651
// TODO: Should we try to look for neg/abs here?
36523652
}
36533653

3654+
// Look through the truncate to avoid an unnecessary subreg COPY to VGPR_16.
3655+
if (Subtarget->useRealTrue16Insts() && Src.getOpcode() == ISD::TRUNCATE) {
3656+
Src = Src.getOperand(0);
3657+
}
36543658
return true;
36553659
}
36563660

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5841,6 +5841,14 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
58415841
CheckAbsNeg();
58425842
}
58435843

5844+
// Since we looked through FPEXT and removed it, we must also remove
5845+
// G_TRUNC. G_TRUNC to 16-bits would have a destination in RC VGPR_16, which
5846+
// is not compatible with MadMix instructions.
5847+
Register PeekSrc = Src;
5848+
if (Subtarget->useRealTrue16Insts() &&
5849+
mi_match(PeekSrc, *MRI, m_GTrunc(m_Reg(PeekSrc))))
5850+
Src = PeekSrc;
5851+
58445852
Matched = true;
58455853
}
58465854

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -175,14 +175,12 @@ define half @v_fdiv_f16(half %a, half %b) {
175175
; GFX11-IEEE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176176
; GFX11-IEEE-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
177177
; GFX11-IEEE-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
178-
; GFX11-IEEE-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
179-
; GFX11-IEEE-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
180178
; GFX11-IEEE-TRUE16-NEXT: v_rcp_f32_e32 v2, v2
181179
; GFX11-IEEE-TRUE16-NEXT: s_waitcnt_depctr 0xfff
182180
; GFX11-IEEE-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v2
183-
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v3, v5 op_sel_hi:[1,0,1]
184-
; GFX11-IEEE-TRUE16-NEXT: v_fmac_f32_e32 v3, v6, v2
185-
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v4, -v4, v3, v5 op_sel_hi:[1,0,1]
181+
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
182+
; GFX11-IEEE-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2
183+
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
186184
; GFX11-IEEE-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v2
187185
; GFX11-IEEE-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2
188186
; GFX11-IEEE-TRUE16-NEXT: v_add_f32_e32 v2, v2, v3
@@ -213,14 +211,12 @@ define half @v_fdiv_f16(half %a, half %b) {
213211
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214212
; GFX11-FLUSH-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
215213
; GFX11-FLUSH-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
216-
; GFX11-FLUSH-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
217-
; GFX11-FLUSH-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
218214
; GFX11-FLUSH-TRUE16-NEXT: v_rcp_f32_e32 v2, v2
219215
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_depctr 0xfff
220216
; GFX11-FLUSH-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v2
221-
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v3, v5 op_sel_hi:[1,0,1]
222-
; GFX11-FLUSH-TRUE16-NEXT: v_fmac_f32_e32 v3, v6, v2
223-
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v4, -v4, v3, v5 op_sel_hi:[1,0,1]
217+
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
218+
; GFX11-FLUSH-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2
219+
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
224220
; GFX11-FLUSH-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v2
225221
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2
226222
; GFX11-FLUSH-TRUE16-NEXT: v_add_f32_e32 v2, v2, v3
@@ -491,14 +487,12 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
491487
; GFX11-IEEE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492488
; GFX11-IEEE-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
493489
; GFX11-IEEE-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
494-
; GFX11-IEEE-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
495-
; GFX11-IEEE-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
496490
; GFX11-IEEE-TRUE16-NEXT: v_rcp_f32_e32 v2, v2
497491
; GFX11-IEEE-TRUE16-NEXT: s_waitcnt_depctr 0xfff
498492
; GFX11-IEEE-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v2
499-
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v3, v5 op_sel_hi:[1,0,1]
500-
; GFX11-IEEE-TRUE16-NEXT: v_fmac_f32_e32 v3, v6, v2
501-
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v4, -v4, v3, v5 op_sel_hi:[1,0,1]
493+
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
494+
; GFX11-IEEE-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2
495+
; GFX11-IEEE-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
502496
; GFX11-IEEE-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v2
503497
; GFX11-IEEE-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2
504498
; GFX11-IEEE-TRUE16-NEXT: v_add_f32_e32 v2, v2, v3
@@ -529,14 +523,12 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
529523
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
530524
; GFX11-FLUSH-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l
531525
; GFX11-FLUSH-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
532-
; GFX11-FLUSH-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
533-
; GFX11-FLUSH-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
534526
; GFX11-FLUSH-TRUE16-NEXT: v_rcp_f32_e32 v2, v2
535527
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_depctr 0xfff
536528
; GFX11-FLUSH-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v2
537-
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v3, v5 op_sel_hi:[1,0,1]
538-
; GFX11-FLUSH-TRUE16-NEXT: v_fmac_f32_e32 v3, v6, v2
539-
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v4, -v4, v3, v5 op_sel_hi:[1,0,1]
529+
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
530+
; GFX11-FLUSH-TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v2
531+
; GFX11-FLUSH-TRUE16-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
540532
; GFX11-FLUSH-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v2
541533
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2
542534
; GFX11-FLUSH-TRUE16-NEXT: v_add_f32_e32 v2, v2, v3

0 commit comments

Comments
 (0)