Skip to content

Commit 48ac846

Browse files
authored
[AMDGPU][GlobalISel] Align selectVOP3PMadMixModsImpl with the SelectionDAG counterpart (llvm#110168)
The current `selectVOP3PMadMixModsImpl` can produce a `V_MAD_MIX_F32` instruction that violates the constant bus restriction, while its `SelectionDAG` counterpart does not. The culprit is the copy stripping: the `SelectionDAG` version only strips bitcasts. This PR simply aligns the two versions.
1 parent a7968f6 commit 48ac846

File tree

7 files changed

+321
-22
lines changed

7 files changed

+321
-22
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
53125312
// Only change Src if src modifier could be gained. In such cases new Src
53135313
// could be sgpr but this does not violate constant bus restriction for
53145314
// instruction that is being selected.
5315-
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
5316-
// since this could violate constant bus restriction.
5317-
Register PeekSrc = stripCopy(Src, *MRI);
5315+
Src = stripBitCast(Src, *MRI);
53185316

53195317
const auto CheckAbsNeg = [&]() {
53205318
// Be careful about folding modifiers if we already have an abs. fneg is
53215319
// applied last, so we don't want to apply an earlier fneg.
53225320
if ((Mods & SISrcMods::ABS) == 0) {
53235321
unsigned ModsTmp;
5324-
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
5322+
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
53255323

5326-
if ((ModsTmp & SISrcMods::NEG) != 0) {
5324+
if ((ModsTmp & SISrcMods::NEG) != 0)
53275325
Mods ^= SISrcMods::NEG;
5328-
Src = PeekSrc;
5329-
}
53305326

5331-
if ((ModsTmp & SISrcMods::ABS) != 0) {
5327+
if ((ModsTmp & SISrcMods::ABS) != 0)
53325328
Mods |= SISrcMods::ABS;
5333-
Src = PeekSrc;
5334-
}
53355329
}
53365330
};
53375331

@@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
53445338

53455339
Mods |= SISrcMods::OP_SEL_1;
53465340

5347-
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
5348-
Src = PeekSrc;
5341+
if (isExtractHiElt(*MRI, Src, Src)) {
53495342
Mods |= SISrcMods::OP_SEL_0;
53505343
CheckAbsNeg();
53515344
}

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
7373
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8
7474
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
7575
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
76+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16
77+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16
78+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16
79+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16
7680
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
77-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
81+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
7882
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
79-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
83+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
8084
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
8185
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
8286
.entry:
@@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
117121
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
118122
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
119123
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11
124+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16
125+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16
126+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16
127+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16
128+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16
129+
; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16
120130
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
121-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
131+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
122132
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
123-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
133+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
124134
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
125-
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
135+
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
126136
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
127137
.entry:
128138
%a = fmul fast <6 x half> %x, %y

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
25552555
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
25562556
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
25572557
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
2558-
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2559-
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
2560-
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
2558+
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
2559+
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
2560+
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
25612561
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
25622562
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
25632563
; GFX9-FLUSH-NEXT: ; return to shader part epilog
@@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
25712571
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
25722572
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
25732573
; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2574-
; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2574+
; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
25752575
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
25762576
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
25772577
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
25882588
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
25892589
; GFX11-NEXT: s_waitcnt_depctr 0xfff
25902590
; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2591-
; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2591+
; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
25922592
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
25932593
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
25942594
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s
3+
4+
define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
5+
; CHECK-LABEL: test_fmamix_constant_bus_violation_sss:
6+
; CHECK: ; %bb.0:
7+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
9+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
10+
; CHECK-NEXT: s_lshr_b32 s6, s16, 16
11+
; CHECK-NEXT: v_mov_b32_e32 v0, s5
12+
; CHECK-NEXT: v_mov_b32_e32 v1, s6
13+
; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1]
14+
; CHECK-NEXT: s_setpc_b64 s[30:31]
15+
%lshr.0 = lshr i32 %val.0, 16
16+
%lshr.1 = lshr i32 %val.1, 16
17+
%lshr.2 = lshr i32 %val.2, 16
18+
%trunc.0 = trunc i32 %lshr.0 to i16
19+
%trunc.1 = trunc i32 %lshr.1 to i16
20+
%trunc.2 = trunc i32 %lshr.2 to i16
21+
%cast.0 = bitcast i16 %trunc.0 to half
22+
%cast.1 = bitcast i16 %trunc.1 to half
23+
%cast.2 = bitcast i16 %trunc.2 to half
24+
%fpext.0 = fpext half %cast.0 to float
25+
%fpext.1 = fpext half %cast.1 to float
26+
%fpext.2 = fpext half %cast.2 to float
27+
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
28+
ret float %fma
29+
}
30+
31+
define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg %val.1, i32 %val.2) #0 {
32+
; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv:
33+
; CHECK: ; %bb.0:
34+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
36+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
37+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
38+
; CHECK-NEXT: v_fma_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1]
39+
; CHECK-NEXT: s_setpc_b64 s[30:31]
40+
%lshr.0 = lshr i32 %val.0, 16
41+
%lshr.1 = lshr i32 %val.1, 16
42+
%lshr.2 = lshr i32 %val.2, 16
43+
%trunc.0 = trunc i32 %lshr.0 to i16
44+
%trunc.1 = trunc i32 %lshr.1 to i16
45+
%trunc.2 = trunc i32 %lshr.2 to i16
46+
%cast.0 = bitcast i16 %trunc.0 to half
47+
%cast.1 = bitcast i16 %trunc.1 to half
48+
%cast.2 = bitcast i16 %trunc.2 to half
49+
%fpext.0 = fpext half %cast.0 to float
50+
%fpext.1 = fpext half %cast.1 to float
51+
%fpext.2 = fpext half %cast.2 to float
52+
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
53+
ret float %fma
54+
}
55+
56+
define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val.1, i32 inreg %val.2) #0 {
57+
; CHECK-LABEL: test_fmamix_constant_bus_violation_svs:
58+
; CHECK: ; %bb.0:
59+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
61+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
62+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
63+
; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1]
64+
; CHECK-NEXT: s_setpc_b64 s[30:31]
65+
%lshr.0 = lshr i32 %val.0, 16
66+
%lshr.1 = lshr i32 %val.1, 16
67+
%lshr.2 = lshr i32 %val.2, 16
68+
%trunc.0 = trunc i32 %lshr.0 to i16
69+
%trunc.1 = trunc i32 %lshr.1 to i16
70+
%trunc.2 = trunc i32 %lshr.2 to i16
71+
%cast.0 = bitcast i16 %trunc.0 to half
72+
%cast.1 = bitcast i16 %trunc.1 to half
73+
%cast.2 = bitcast i16 %trunc.2 to half
74+
%fpext.0 = fpext half %cast.0 to float
75+
%fpext.1 = fpext half %cast.1 to float
76+
%fpext.2 = fpext half %cast.2 to float
77+
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
78+
ret float %fma
79+
}
80+
81+
define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
82+
; CHECK-LABEL: test_fmamix_constant_bus_violation_vss:
83+
; CHECK: ; %bb.0:
84+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
86+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
87+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
88+
; CHECK-NEXT: v_fma_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
89+
; CHECK-NEXT: s_setpc_b64 s[30:31]
90+
%lshr.0 = lshr i32 %val.0, 16
91+
%lshr.1 = lshr i32 %val.1, 16
92+
%lshr.2 = lshr i32 %val.2, 16
93+
%trunc.0 = trunc i32 %lshr.0 to i16
94+
%trunc.1 = trunc i32 %lshr.1 to i16
95+
%trunc.2 = trunc i32 %lshr.2 to i16
96+
%cast.0 = bitcast i16 %trunc.0 to half
97+
%cast.1 = bitcast i16 %trunc.1 to half
98+
%cast.2 = bitcast i16 %trunc.2 to half
99+
%fpext.0 = fpext half %cast.0 to float
100+
%fpext.1 = fpext half %cast.1 to float
101+
%fpext.2 = fpext half %cast.2 to float
102+
%fma = call float @llvm.fma.f32(float %fpext.0, float %fpext.1, float %fpext.2)
103+
ret float %fma
104+
}
105+
106+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign" }
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
3+
4+
---
5+
name: foo
6+
legalized: true
7+
regBankSelected: true
8+
machineFunctionInfo:
9+
mode:
10+
fp32-output-denormals: false
11+
fp32-input-denormals: false
12+
body: |
13+
bb.0:
14+
; GFX9-LABEL: name: foo
15+
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
16+
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
17+
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
18+
; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
19+
; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
20+
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
21+
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
22+
; GFX9-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
23+
; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_FMA_MIX_F32_]], implicit $exec
24+
; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
25+
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
26+
%0:sgpr(s32) = COPY $sgpr0
27+
%1:sgpr(s32) = COPY $sgpr1
28+
%2:sgpr(s32) = G_CONSTANT i32 16
29+
%3:sgpr(s32) = G_LSHR %0:sgpr, %2:sgpr(s32)
30+
%4:sgpr(s16) = G_TRUNC %3:sgpr(s32)
31+
%5:sgpr(s32) = G_LSHR %1:sgpr, %2:sgpr(s32)
32+
%6:sgpr(s16) = G_TRUNC %5:sgpr(s32)
33+
%7:vgpr(s16) = COPY %4:sgpr(s16)
34+
%8:vgpr(s32) = G_FPEXT %7:vgpr(s16)
35+
%9:vgpr(s16) = COPY %6:sgpr(s16)
36+
%10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
37+
%11:vgpr(s32) = G_FNEG %10:vgpr
38+
%12:vgpr(s32) = G_FMA %11:vgpr, %10:vgpr, %8:vgpr
39+
%13:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %12:vgpr(s32)
40+
$sgpr0 = COPY %13:sgpr(s32)
41+
SI_RETURN_TO_EPILOG implicit $sgpr0
42+
...
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s
3+
4+
define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
5+
; CHECK-LABEL: test_fmamix_constant_bus_violation_sss:
6+
; CHECK: ; %bb.0:
7+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
9+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
10+
; CHECK-NEXT: s_lshr_b32 s6, s16, 16
11+
; CHECK-NEXT: v_mov_b32_e32 v0, s5
12+
; CHECK-NEXT: v_mov_b32_e32 v1, s6
13+
; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1]
14+
; CHECK-NEXT: s_setpc_b64 s[30:31]
15+
%lshr.0 = lshr i32 %val.0, 16
16+
%lshr.1 = lshr i32 %val.1, 16
17+
%lshr.2 = lshr i32 %val.2, 16
18+
%trunc.0 = trunc i32 %lshr.0 to i16
19+
%trunc.1 = trunc i32 %lshr.1 to i16
20+
%trunc.2 = trunc i32 %lshr.2 to i16
21+
%cast.0 = bitcast i16 %trunc.0 to half
22+
%cast.1 = bitcast i16 %trunc.1 to half
23+
%cast.2 = bitcast i16 %trunc.2 to half
24+
%fpext.0 = fpext half %cast.0 to float
25+
%fpext.1 = fpext half %cast.1 to float
26+
%fpext.2 = fpext half %cast.2 to float
27+
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
28+
ret float %fma
29+
}
30+
31+
define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg %val.1, i32 %val.2) #0 {
32+
; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv:
33+
; CHECK: ; %bb.0:
34+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
36+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
37+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
38+
; CHECK-NEXT: v_mad_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1]
39+
; CHECK-NEXT: s_setpc_b64 s[30:31]
40+
%lshr.0 = lshr i32 %val.0, 16
41+
%lshr.1 = lshr i32 %val.1, 16
42+
%lshr.2 = lshr i32 %val.2, 16
43+
%trunc.0 = trunc i32 %lshr.0 to i16
44+
%trunc.1 = trunc i32 %lshr.1 to i16
45+
%trunc.2 = trunc i32 %lshr.2 to i16
46+
%cast.0 = bitcast i16 %trunc.0 to half
47+
%cast.1 = bitcast i16 %trunc.1 to half
48+
%cast.2 = bitcast i16 %trunc.2 to half
49+
%fpext.0 = fpext half %cast.0 to float
50+
%fpext.1 = fpext half %cast.1 to float
51+
%fpext.2 = fpext half %cast.2 to float
52+
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
53+
ret float %fma
54+
}
55+
56+
define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val.1, i32 inreg %val.2) #0 {
57+
; CHECK-LABEL: test_fmamix_constant_bus_violation_svs:
58+
; CHECK: ; %bb.0:
59+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
61+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
62+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
63+
; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1]
64+
; CHECK-NEXT: s_setpc_b64 s[30:31]
65+
%lshr.0 = lshr i32 %val.0, 16
66+
%lshr.1 = lshr i32 %val.1, 16
67+
%lshr.2 = lshr i32 %val.2, 16
68+
%trunc.0 = trunc i32 %lshr.0 to i16
69+
%trunc.1 = trunc i32 %lshr.1 to i16
70+
%trunc.2 = trunc i32 %lshr.2 to i16
71+
%cast.0 = bitcast i16 %trunc.0 to half
72+
%cast.1 = bitcast i16 %trunc.1 to half
73+
%cast.2 = bitcast i16 %trunc.2 to half
74+
%fpext.0 = fpext half %cast.0 to float
75+
%fpext.1 = fpext half %cast.1 to float
76+
%fpext.2 = fpext half %cast.2 to float
77+
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
78+
ret float %fma
79+
}
80+
81+
define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
82+
; CHECK-LABEL: test_fmamix_constant_bus_violation_vss:
83+
; CHECK: ; %bb.0:
84+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85+
; CHECK-NEXT: s_lshr_b32 s5, s7, 16
86+
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
87+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
88+
; CHECK-NEXT: v_mad_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1]
89+
; CHECK-NEXT: s_setpc_b64 s[30:31]
90+
%lshr.0 = lshr i32 %val.0, 16
91+
%lshr.1 = lshr i32 %val.1, 16
92+
%lshr.2 = lshr i32 %val.2, 16
93+
%trunc.0 = trunc i32 %lshr.0 to i16
94+
%trunc.1 = trunc i32 %lshr.1 to i16
95+
%trunc.2 = trunc i32 %lshr.2 to i16
96+
%cast.0 = bitcast i16 %trunc.0 to half
97+
%cast.1 = bitcast i16 %trunc.1 to half
98+
%cast.2 = bitcast i16 %trunc.2 to half
99+
%fpext.0 = fpext half %cast.0 to float
100+
%fpext.1 = fpext half %cast.1 to float
101+
%fpext.2 = fpext half %cast.2 to float
102+
%fma = call float @llvm.fmuladd.f32(float %fpext.0, float %fpext.1, float %fpext.2)
103+
ret float %fma
104+
}
105+
106+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign" }

0 commit comments

Comments
 (0)