Skip to content

Commit 3e2b5c8

Browse files
committed
[AMDGPU][True16] Look through COPY when folding clamp in SIFoldOperands
1 parent e92ff64 commit 3e2b5c8

File tree

3 files changed

+45
-23
lines changed

3 files changed

+45
-23
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1820,6 +1820,10 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
18201820
return false;
18211821

18221822
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1823+
MachineInstr *OrigDef = Def;
1824+
// Look through COPY. COPY only observed with True16.
1825+
if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
1826+
Def = MRI->getVRegDef(Def->getOperand(1).getReg());
18231827

18241828
// The type of clamp must be compatible.
18251829
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1837,7 +1841,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
18371841
// Clamp is applied after omod, so it is OK if omod is set.
18381842
DefClamp->setImm(1);
18391843

1840-
Register DefReg = Def->getOperand(0).getReg();
1844+
Register DefReg = OrigDef->getOperand(0).getReg();
18411845
Register MIDstReg = MI.getOperand(0).getReg();
18421846
if (TRI->isSGPRReg(*MRI, DefReg)) {
18431847
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*

llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
269269
}
270270

271271
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
272-
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
273-
; SDAG-GFX1100-TRUE16: ; %bb.0:
274-
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275-
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
276-
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
277-
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
278-
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
279-
;
280-
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
281-
; SDAG-GFX1100-FAKE16: ; %bb.0:
282-
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283-
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
284-
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
272+
; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
273+
; GFX1100: ; %bb.0:
274+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275+
; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
276+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
285277
;
286278
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
287279
; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
312304
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
313305
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
314306
;
315-
; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
316-
; GISEL-GFX1100: ; %bb.0:
317-
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318-
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
319-
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
320-
;
321307
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
322308
; GISEL-CI: ; %bb.0:
323309
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
15241510
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
15251511
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
15261512
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1527-
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
1528-
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
1529-
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1513+
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
15301514
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
1515+
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
15311516
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
15321517
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
15331518
;
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr="+wavefrontsize32",+real-true16 -verify-machineinstrs -o - %s | FileCheck %s
3+
4+
---
5+
name: fold_16bit_madmix_clamp
6+
tracksRegLiveness: true
7+
registers:
8+
body: |
9+
bb.0.entry:
10+
liveins: $vgpr0, $vgpr1, $vgpr2
11+
; CHECK-LABEL: name: fold_16bit_madmix_clamp
12+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
15+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
16+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
17+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
18+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
19+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
20+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
21+
; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
22+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
23+
%10:vgpr_32 = COPY $vgpr2
24+
%9:vgpr_32 = COPY $vgpr1
25+
%8:vgpr_32 = COPY $vgpr0
26+
%12:sreg_32 = IMPLICIT_DEF
27+
%13:vgpr_32 = COPY %12:sreg_32
28+
%11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
29+
%15:vgpr_16 = COPY %11:vgpr_32
30+
%14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
31+
$vgpr0 = COPY %14:vgpr_16
32+
S_ENDPGM 0, implicit $vgpr0
33+
...

0 commit comments

Comments
 (0)