Skip to content

[AMDGPU][GlobalISel] Align selectVOP3PMadMixModsImpl with the SelectionDAG counterpart #110168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
// since this could violate constant bus restriction.
Register PeekSrc = stripCopy(Src, *MRI);
Src = stripBitCast(Src, *MRI);

const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);

if ((ModsTmp & SISrcMods::NEG) != 0) {
if ((ModsTmp & SISrcMods::NEG) != 0)
Mods ^= SISrcMods::NEG;
Src = PeekSrc;
}

if ((ModsTmp & SISrcMods::ABS) != 0) {
if ((ModsTmp & SISrcMods::ABS) != 0)
Mods |= SISrcMods::ABS;
Src = PeekSrc;
}
}
};

Expand All @@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,

Mods |= SISrcMods::OP_SEL_1;

if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
Src = PeekSrc;
if (isExtractHiElt(*MRI, Src, Src)) {
Mods |= SISrcMods::OP_SEL_0;
CheckAbsNeg();
}
Expand Down
20 changes: 15 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This regressed gfx10, which has the more relaxed constant bus limit.

Copy link
Contributor Author

@shiltian shiltian Oct 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The v_fma_mix_f32 instruction still uses two SGPRs. The only issue here is that the selector can't see "through" the copies, so it can't use the source operands of the shifts directly via op_sel. I think the SelectionDAG version also has this limitation (apparently because we "copied" its logic here).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should still be fixed in a follow-up.

; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
Expand Down Expand Up @@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul fast <6 x half> %x, %y
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
Expand All @@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
Expand All @@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
Expand Down
106 changes: 106 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s

; Every operand is an inreg i32 whose upper 16 bits are shifted down, truncated,
; bitcast to half, extended to float and passed to llvm.fma.f32. The assertions
; expect all three shifts to be emitted as s_lshr_b32 and two of the results to
; be moved into VGPRs, so that v_fma_mix_f32 reads a single SGPR (s4).
define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_sss:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s6, s16, 16
; CHECK-NEXT: v_mov_b32_e32 v0, s5
; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

; %val.0 and %val.1 are inreg, %val.2 is not. The upper 16 bits of each input
; are converted half -> float and passed to llvm.fma.f32. The assertions expect
; s_lshr_b32 only for the two inreg inputs, one of which is then moved into a
; VGPR; the third source (v0) has no shift and gets op_sel:[0,0,1] instead.
define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg %val.1, i32 %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_fma_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

; %val.0 and %val.2 are inreg, %val.1 is not. The upper 16 bits of each input
; are converted half -> float and passed to llvm.fma.f32. The assertions expect
; s_lshr_b32 only for the two inreg inputs, one of which is then moved into a
; VGPR; the second source (v0) has no shift and gets op_sel:[0,1,0] instead.
define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_svs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

; %val.1 and %val.2 are inreg, %val.0 is not. The upper 16 bits of each input
; are converted half -> float and passed to llvm.fma.f32. The assertions expect
; s_lshr_b32 only for the two inreg inputs, one of which is then moved into a
; VGPR; the first source (v0) has no shift and gets op_sel:[1,0,0] instead.
define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_vss:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_fma_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we have end-to-end IR tests, can I just remove these two MIR tests?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably keep both.


# Instruction-select test: two SGPR inputs are shifted right by 16, truncated to
# s16, copied to VGPRs and extended with G_FPEXT. One extended value is used
# both negated (G_FNEG) and un-negated as the first two G_FMA operands; the
# other is the addend. The assertions expect V_FMA_MIX_F32 to read the VGPR
# copies of the S_LSHR_B32 results, with source-modifier immediates 9, 8, 8
# (presumably OP_SEL_1 with and without NEG; confirm against SISrcMods).
---
name: foo
legalized: true
regBankSelected: true
machineFunctionInfo:
mode:
fp32-output-denormals: false
fp32-input-denormals: false
body: |
bb.0:
; GFX9-LABEL: name: foo
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
; GFX9-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_FMA_MIX_F32_]], implicit $exec
; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
%2:sgpr(s32) = G_CONSTANT i32 16
%3:sgpr(s32) = G_LSHR %0:sgpr, %2:sgpr(s32)
%4:sgpr(s16) = G_TRUNC %3:sgpr(s32)
%5:sgpr(s32) = G_LSHR %1:sgpr, %2:sgpr(s32)
%6:sgpr(s16) = G_TRUNC %5:sgpr(s32)
%7:vgpr(s16) = COPY %4:sgpr(s16)
%8:vgpr(s32) = G_FPEXT %7:vgpr(s16)
%9:vgpr(s16) = COPY %6:sgpr(s16)
%10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
%11:vgpr(s32) = G_FNEG %10:vgpr
%12:vgpr(s32) = G_FMA %11:vgpr, %10:vgpr, %8:vgpr
%13:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %12:vgpr(s32)
$sgpr0 = COPY %13:sgpr(s32)
SI_RETURN_TO_EPILOG implicit $sgpr0
...
106 changes: 106 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s

; Every operand is an inreg i32 whose upper 16 bits are shifted down, truncated,
; bitcast to half, extended to float and passed to llvm.fmuladd.f32. Despite the
; "fmamix" in the name, the assertions expect v_mad_mix_f32. All three shifts
; are expected as s_lshr_b32, with two of the results moved into VGPRs so that
; the mix instruction reads a single SGPR (s4).
define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_sss:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s6, s16, 16
; CHECK-NEXT: v_mov_b32_e32 v0, s5
; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

; %val.0 and %val.1 are inreg, %val.2 is not. The upper 16 bits of each input
; are converted half -> float and passed to llvm.fmuladd.f32; despite the
; "fmamix" in the name, the assertions expect v_mad_mix_f32. s_lshr_b32 is
; expected only for the two inreg inputs, one of which is then moved into a
; VGPR; the third source (v0) has no shift and gets op_sel:[0,0,1] instead.
define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg %val.1, i32 %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_mad_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

; %val.0 and %val.2 are inreg, %val.1 is not. The upper 16 bits of each input
; are converted half -> float and passed to llvm.fmuladd.f32; despite the
; "fmamix" in the name, the assertions expect v_mad_mix_f32. s_lshr_b32 is
; expected only for the two inreg inputs, one of which is then moved into a
; VGPR; the second source (v0) has no shift and gets op_sel:[0,1,0] instead.
define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_svs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

; %val.1 and %val.2 are inreg, %val.0 is not. The upper 16 bits of each input
; are converted half -> float and passed to llvm.fmuladd.f32; despite the
; "fmamix" in the name, the assertions expect v_mad_mix_f32. s_lshr_b32 is
; expected only for the two inreg inputs, one of which is then moved into a
; VGPR; the first source (v0) has no shift and gets op_sel:[1,0,0] instead.
define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_vss:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_mad_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Take the high half of each 32-bit input and widen it to float.
%lshr.0 = lshr i32 %val.0, 16
%lshr.1 = lshr i32 %val.1, 16
%lshr.2 = lshr i32 %val.2, 16
%trunc.0 = trunc i32 %lshr.0 to i16
%trunc.1 = trunc i32 %lshr.1 to i16
%trunc.2 = trunc i32 %lshr.2 to i16
%cast.0 = bitcast i16 %trunc.0 to half
%cast.1 = bitcast i16 %trunc.1 to half
%cast.2 = bitcast i16 %trunc.2 to half
%fpext.0 = fpext half %cast.0 to float
%fpext.1 = fpext half %cast.1 to float
%fpext.2 = fpext half %cast.2 to float
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
ret float %fma
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign" }
Loading
Loading