-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AMDGPU][GlobalISel] Align selectVOP3PMadMixModsImpl
with the SelectionDAG
counterpart
#110168
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) ChangesThe current Full diff: https://github.com/llvm/llvm-project/pull/110168.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index febf0711c7d574..c3df962d448f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
- // Note: Src is not changed when there is only a simple sgpr to vgpr copy
- // since this could violate constant bus restriction.
- Register PeekSrc = stripCopy(Src, *MRI);
+ Src = stripBitCast(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
- std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
+ std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
- if ((ModsTmp & SISrcMods::NEG) != 0) {
+ if ((ModsTmp & SISrcMods::NEG) != 0)
Mods ^= SISrcMods::NEG;
- Src = PeekSrc;
- }
- if ((ModsTmp & SISrcMods::ABS) != 0) {
+ if ((ModsTmp & SISrcMods::ABS) != 0)
Mods |= SISrcMods::ABS;
- Src = PeekSrc;
- }
}
};
@@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
- Src = PeekSrc;
+ if (isExtractHiElt(*MRI, Src, Src)) {
Mods |= SISrcMods::OP_SEL_0;
CheckAbsNeg();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4ebe1c499a1769..4d603f7487754a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
@@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul fast <6 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 89cd18ad9be70b..1a98285230b2cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
@@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
new file mode 100644
index 00000000000000..ba9cfb8e1d68ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name: s_fdiv_v2f16
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+ mode:
+ fp32-output-denormals: false
+ fp32-input-denormals: false
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: s_fdiv_v2f16
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_CVT_F32_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_CVT_F32_F16_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 0, [[V_MUL_F32_e64_]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[V_MAD_MIX_F32_]], 0, [[V_RCP_F32_e64_]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_1:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 0, [[V_MAC_F32_e64_]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MAD_MIX_F32_1]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_MUL_F32_e64_1]], [[COPY4]], implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_AND_B32_e64_]], 0, [[V_MAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX9-NEXT: [[V_DIV_FIXUP_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_gfx9_e64 0, [[V_CVT_F16_F32_e64_]], 0, [[COPY5]], 0, [[COPY6]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY8]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_RCP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_CVT_F32_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_CVT_F32_F16_e64_2]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_2:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY8]], 0, [[V_MUL_F32_e64_2]], 8, [[COPY7]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAC_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[V_MAD_MIX_F32_2]], 0, [[V_RCP_F32_e64_1]], 0, [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_3:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY8]], 0, [[V_MAC_F32_e64_1]], 8, [[COPY7]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MAD_MIX_F32_3]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX9-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_MUL_F32_e64_3]], [[COPY9]], implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_AND_B32_e64_1]], 0, [[V_MAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+ ; GFX9-NEXT: [[V_DIV_FIXUP_F16_gfx9_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_gfx9_e64 0, [[V_CVT_F16_F32_e64_1]], 0, [[COPY10]], 0, [[COPY11]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, [[V_DIV_FIXUP_F16_gfx9_e64_]], 0, [[V_DIV_FIXUP_F16_gfx9_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_PACK_B32_F16_e64_]], implicit $exec
+ ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s16) = G_TRUNC %0:sgpr(s32)
+ %3:sgpr(s32) = G_CONSTANT i32 16
+ %4:sgpr(s32) = G_LSHR %0:sgpr, %3:sgpr(s32)
+ %5:sgpr(s16) = G_TRUNC %4:sgpr(s32)
+ %6:sgpr(s16) = G_TRUNC %1:sgpr(s32)
+ %7:sgpr(s32) = G_LSHR %1:sgpr, %3:sgpr(s32)
+ %8:sgpr(s16) = G_TRUNC %7:sgpr(s32)
+ %9:vgpr(s16) = COPY %2:sgpr(s16)
+ %10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
+ %11:vgpr(s16) = COPY %6:sgpr(s16)
+ %12:vgpr(s32) = G_FPEXT %11:vgpr(s16)
+ %13:vgpr(s32) = G_FNEG %12:vgpr
+ %14:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %12:vgpr(s32)
+ %15:vgpr(s32) = G_FMUL %10:vgpr, %14:vgpr
+ %16:vgpr(s32) = G_FMAD %13:vgpr, %15:vgpr, %10:vgpr
+ %17:vgpr(s32) = G_FMAD %16:vgpr, %14:vgpr, %15:vgpr
+ %18:vgpr(s32) = G_FMAD %13:vgpr, %17:vgpr, %10:vgpr
+ %19:vgpr(s32) = G_FMUL %18:vgpr, %14:vgpr
+ %20:sgpr(s32) = G_CONSTANT i32 -8388608
+ %21:vgpr(s32) = COPY %20:sgpr(s32)
+ %22:vgpr(s32) = G_AND %19:vgpr, %21:vgpr
+ %23:vgpr(s32) = G_FADD %22:vgpr, %17:vgpr
+ %24:vgpr(s16) = G_FPTRUNC %23:vgpr(s32)
+ %25:vgpr(s16) = COPY %6:sgpr(s16)
+ %26:vgpr(s16) = COPY %2:sgpr(s16)
+ %27:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), %24:vgpr(s16), %25:vgpr(s16), %26:vgpr(s16)
+ %28:vgpr(s16) = COPY %5:sgpr(s16)
+ %29:vgpr(s32) = G_FPEXT %28:vgpr(s16)
+ %30:vgpr(s16) = COPY %8:sgpr(s16)
+ %31:vgpr(s32) = G_FPEXT %30:vgpr(s16)
+ %32:vgpr(s32) = G_FNEG %31:vgpr
+ %33:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %31:vgpr(s32)
+ %34:vgpr(s32) = G_FMUL %29:vgpr, %33:vgpr
+ %35:vgpr(s32) = G_FMAD %32:vgpr, %34:vgpr, %29:vgpr
+ %36:vgpr(s32) = G_FMAD %35:vgpr, %33:vgpr, %34:vgpr
+ %37:vgpr(s32) = G_FMAD %32:vgpr, %36:vgpr, %29:vgpr
+ %38:vgpr(s32) = G_FMUL %37:vgpr, %33:vgpr
+ %39:vgpr(s32) = COPY %20:sgpr(s32)
+ %40:vgpr(s32) = G_AND %38:vgpr, %39:vgpr
+ %41:vgpr(s32) = G_FADD %40:vgpr, %36:vgpr
+ %42:vgpr(s16) = G_FPTRUNC %41:vgpr(s32)
+ %43:vgpr(s16) = COPY %8:sgpr(s16)
+ %44:vgpr(s16) = COPY %5:sgpr(s16)
+ %45:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), %42:vgpr(s16), %43:vgpr(s16), %44:vgpr(s16)
+ %46:vgpr(<2 x s16>) = G_BUILD_VECTOR %27:vgpr(s16), %45:vgpr(s16)
+ %47:vgpr(s32) = G_BITCAST %46:vgpr(<2 x s16>)
+ %48:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %47:vgpr(s32)
+ $sgpr0 = COPY %48:sgpr(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+...
|
@llvm/pr-subscribers-llvm-globalisel Author: Shilei Tian (shiltian) ChangesThe current Full diff: https://github.com/llvm/llvm-project/pull/110168.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index febf0711c7d574..c3df962d448f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
- // Note: Src is not changed when there is only a simple sgpr to vgpr copy
- // since this could violate constant bus restriction.
- Register PeekSrc = stripCopy(Src, *MRI);
+ Src = stripBitCast(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
- std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
+ std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
- if ((ModsTmp & SISrcMods::NEG) != 0) {
+ if ((ModsTmp & SISrcMods::NEG) != 0)
Mods ^= SISrcMods::NEG;
- Src = PeekSrc;
- }
- if ((ModsTmp & SISrcMods::ABS) != 0) {
+ if ((ModsTmp & SISrcMods::ABS) != 0)
Mods |= SISrcMods::ABS;
- Src = PeekSrc;
- }
}
};
@@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
- Src = PeekSrc;
+ if (isExtractHiElt(*MRI, Src, Src)) {
Mods |= SISrcMods::OP_SEL_0;
CheckAbsNeg();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4ebe1c499a1769..4d603f7487754a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
@@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul fast <6 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 89cd18ad9be70b..1a98285230b2cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
@@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
new file mode 100644
index 00000000000000..ba9cfb8e1d68ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name: s_fdiv_v2f16
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+ mode:
+ fp32-output-denormals: false
+ fp32-input-denormals: false
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: s_fdiv_v2f16
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_CVT_F32_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_CVT_F32_F16_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 0, [[V_MUL_F32_e64_]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[V_MAD_MIX_F32_]], 0, [[V_RCP_F32_e64_]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_1:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 0, [[V_MAC_F32_e64_]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MAD_MIX_F32_1]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_MUL_F32_e64_1]], [[COPY4]], implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_AND_B32_e64_]], 0, [[V_MAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX9-NEXT: [[V_DIV_FIXUP_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_gfx9_e64 0, [[V_CVT_F16_F32_e64_]], 0, [[COPY5]], 0, [[COPY6]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY8]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_RCP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_CVT_F32_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_CVT_F32_F16_e64_2]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_2:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY8]], 0, [[V_MUL_F32_e64_2]], 8, [[COPY7]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAC_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[V_MAD_MIX_F32_2]], 0, [[V_RCP_F32_e64_1]], 0, [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_3:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY8]], 0, [[V_MAC_F32_e64_1]], 8, [[COPY7]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MAD_MIX_F32_3]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX9-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_MUL_F32_e64_3]], [[COPY9]], implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_AND_B32_e64_1]], 0, [[V_MAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+ ; GFX9-NEXT: [[V_DIV_FIXUP_F16_gfx9_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_gfx9_e64 0, [[V_CVT_F16_F32_e64_1]], 0, [[COPY10]], 0, [[COPY11]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, [[V_DIV_FIXUP_F16_gfx9_e64_]], 0, [[V_DIV_FIXUP_F16_gfx9_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_PACK_B32_F16_e64_]], implicit $exec
+ ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s16) = G_TRUNC %0:sgpr(s32)
+ %3:sgpr(s32) = G_CONSTANT i32 16
+ %4:sgpr(s32) = G_LSHR %0:sgpr, %3:sgpr(s32)
+ %5:sgpr(s16) = G_TRUNC %4:sgpr(s32)
+ %6:sgpr(s16) = G_TRUNC %1:sgpr(s32)
+ %7:sgpr(s32) = G_LSHR %1:sgpr, %3:sgpr(s32)
+ %8:sgpr(s16) = G_TRUNC %7:sgpr(s32)
+ %9:vgpr(s16) = COPY %2:sgpr(s16)
+ %10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
+ %11:vgpr(s16) = COPY %6:sgpr(s16)
+ %12:vgpr(s32) = G_FPEXT %11:vgpr(s16)
+ %13:vgpr(s32) = G_FNEG %12:vgpr
+ %14:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %12:vgpr(s32)
+ %15:vgpr(s32) = G_FMUL %10:vgpr, %14:vgpr
+ %16:vgpr(s32) = G_FMAD %13:vgpr, %15:vgpr, %10:vgpr
+ %17:vgpr(s32) = G_FMAD %16:vgpr, %14:vgpr, %15:vgpr
+ %18:vgpr(s32) = G_FMAD %13:vgpr, %17:vgpr, %10:vgpr
+ %19:vgpr(s32) = G_FMUL %18:vgpr, %14:vgpr
+ %20:sgpr(s32) = G_CONSTANT i32 -8388608
+ %21:vgpr(s32) = COPY %20:sgpr(s32)
+ %22:vgpr(s32) = G_AND %19:vgpr, %21:vgpr
+ %23:vgpr(s32) = G_FADD %22:vgpr, %17:vgpr
+ %24:vgpr(s16) = G_FPTRUNC %23:vgpr(s32)
+ %25:vgpr(s16) = COPY %6:sgpr(s16)
+ %26:vgpr(s16) = COPY %2:sgpr(s16)
+ %27:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), %24:vgpr(s16), %25:vgpr(s16), %26:vgpr(s16)
+ %28:vgpr(s16) = COPY %5:sgpr(s16)
+ %29:vgpr(s32) = G_FPEXT %28:vgpr(s16)
+ %30:vgpr(s16) = COPY %8:sgpr(s16)
+ %31:vgpr(s32) = G_FPEXT %30:vgpr(s16)
+ %32:vgpr(s32) = G_FNEG %31:vgpr
+ %33:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %31:vgpr(s32)
+ %34:vgpr(s32) = G_FMUL %29:vgpr, %33:vgpr
+ %35:vgpr(s32) = G_FMAD %32:vgpr, %34:vgpr, %29:vgpr
+ %36:vgpr(s32) = G_FMAD %35:vgpr, %33:vgpr, %34:vgpr
+ %37:vgpr(s32) = G_FMAD %32:vgpr, %36:vgpr, %29:vgpr
+ %38:vgpr(s32) = G_FMUL %37:vgpr, %33:vgpr
+ %39:vgpr(s32) = COPY %20:sgpr(s32)
+ %40:vgpr(s32) = G_AND %38:vgpr, %39:vgpr
+ %41:vgpr(s32) = G_FADD %40:vgpr, %36:vgpr
+ %42:vgpr(s16) = G_FPTRUNC %41:vgpr(s32)
+ %43:vgpr(s16) = COPY %8:sgpr(s16)
+ %44:vgpr(s16) = COPY %5:sgpr(s16)
+ %45:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), %42:vgpr(s16), %43:vgpr(s16), %44:vgpr(s16)
+ %46:vgpr(<2 x s16>) = G_BUILD_VECTOR %27:vgpr(s16), %45:vgpr(s16)
+ %47:vgpr(s32) = G_BITCAST %46:vgpr(<2 x s16>)
+ %48:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %47:vgpr(s32)
+ $sgpr0 = COPY %48:sgpr(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+...
|
I understand that this seems to revert some optimization, so I'm happy to hear about better approaches. |
selectVOP3PMadMixModsImpl
same as the SelectionDAG counterpartselectVOP3PMadMixModsImpl
with the SelectionDAG
counterpart
4e5ee8a
to
7f665f0
Compare
IMO, the easiest approach is to force vgpr reg class on all operands when selecting vop3p instructions. Do this with constrainOperandRegClass, to vgpr reg class you can get from getVGPRClassForBitWidth. This will insert copy from sgpr to vgpr for all operands. |
8131471
to
b7e1fa3
Compare
IIUC, this PR made the right move by removing the copy stripping. |
What I meant was to constrainOperandRegClass to vgpr after adding sgpr operand to MIB. This way there is no need to avoid copy stripping and final asm code stays the same. Didn't try to implement it, so it may not be that easy. |
I added the following code into
|
Overall we can make it align with the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I trust this. It might be better to directly check the constant bus restriction, similar to how ThreeOpFrag does
llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
Outdated
Show resolved
Hide resolved
llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
Outdated
Show resolved
Hide resolved
Hi @shiltian, I did not look into the patch carefully, can you provide LLVM IR test that violates constant bus restriction first. What I had in mind was that you somehow end up with sgpr register after selectVOP3PMadMixModsImpl If you want to use existing functions you could
I added snippet that I would assume works, but don't have test for it. |
Which part?
I tried this as you suggested but because the pattern here is a |
38b55c0
to
9f1f15e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should have end to end IR test, including bitcasted cases
llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir
Outdated
Show resolved
Hide resolved
9f1f15e
to
494e9fa
Compare
Thanks @petar-avramovic . It indeed works. It might be better to do it in TableGen so I will probably make a follow-up patch. For now, I'd like to get the issue resolved such that another PR can be merged as well. |
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x, | |||
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8 | |||
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 | |||
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 | |||
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16 | |||
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This regressed gfx10 which has the more relaxed constant bus limit
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The v_fma_mix_f32
instruction still uses two SGPRs. The only issue here is, it can't see "through" the copies such that it can't directly use the source operands of the shifts via op_sel
. I think the ISel version also has this limitation (apparently because here we "copied" the logic).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still should fix this in a follow up
llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll
Outdated
Show resolved
Hide resolved
llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll
Outdated
Show resolved
Hide resolved
…ectionDAG counterpart The current `selectVOP3PMadMixModsImpl` can produce a `V_MAD_MIX_F32` instruction that violates the constant bus restriction, while its `SelectionDAG` counterpart doesn't. The culprit is in the copy stripping, while the SelectionDAG version only has a bitcast stripping. This PR simply aligns the two versions.
494e9fa
to
c5680dd
Compare
@@ -0,0 +1,42 @@ | |||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | |||
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we have end-to-end IR tests, can I just remove this two MIR tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should keep both probably
bump |
…ctionDAG` counterpart (llvm#110168) The current `selectVOP3PMadMixModsImpl` can produce a `V_MAD_MIX_F32` instruction that violates the constant bus restriction, while its `SelectionDAG` counterpart doesn't. The culprit is in the copy stripping, while the `SelectionDAG` version only has a bitcast stripping. This PR simply aligns the two versions. (cherry picked from commit 48ac846) Change-Id: I33ca6f072d443b9571675d7bee864d7fb3c4e1cc
The current `selectVOP3PMadMixModsImpl` can produce a `V_MAD_MIX_F32` instruction
that violates the constant bus restriction, while its `SelectionDAG` counterpart
doesn't. The culprit is in the copy stripping, while the `SelectionDAG` version
only has a bitcast stripping. This PR simply aligns the two versions.