Skip to content

Commit 91c8d4f

Browse files
committed
true16 for fold clamp
1 parent cf05b6e commit 91c8d4f

File tree

3 files changed

+74
-23
lines changed

3 files changed

+74
-23
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1721,6 +1721,10 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
17211721
return false;
17221722

17231723
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1724+
MachineInstr *OrigDef = Def;
1725+
// Look through COPY. COPY only observed with True16.
1726+
if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
1727+
Def = MRI->getVRegDef(Def->getOperand(1).getReg());
17241728

17251729
// The type of clamp must be compatible.
17261730
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1738,7 +1742,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
17381742
// Clamp is applied after omod, so it is OK if omod is set.
17391743
DefClamp->setImm(1);
17401744

1741-
Register DefReg = Def->getOperand(0).getReg();
1745+
Register DefReg = OrigDef->getOperand(0).getReg();
17421746
Register MIDstReg = MI.getOperand(0).getReg();
17431747
if (TRI->isSGPRReg(*MRI, DefReg)) {
17441748
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*

llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
269269
}
270270

271271
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
272-
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
273-
; SDAG-GFX1100-TRUE16: ; %bb.0:
274-
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275-
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
276-
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
277-
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
278-
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
279-
;
280-
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
281-
; SDAG-GFX1100-FAKE16: ; %bb.0:
282-
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283-
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
284-
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
272+
; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
273+
; GFX1100: ; %bb.0:
274+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275+
; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
276+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
285277
;
286278
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
287279
; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
312304
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
313305
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
314306
;
315-
; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
316-
; GISEL-GFX1100: ; %bb.0:
317-
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318-
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
319-
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
320-
;
321307
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
322308
; GISEL-CI: ; %bb.0:
323309
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1546,10 +1532,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
15461532
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
15471533
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
15481534
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1549-
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
1550-
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
1551-
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1535+
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
15521536
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
1537+
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
15531538
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
15541539
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
15551540
;
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr="+wavefrontsize32",+real-true16 -verify-machineinstrs -o - %s | FileCheck %s
3+
4+
---
5+
name: fold_16bit_subreg_1
6+
tracksRegLiveness: true
7+
registers:
8+
body: |
9+
bb.0.entry:
10+
; CHECK-LABEL: name: fold_16bit_subreg_1
11+
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
12+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].sub1_lo16
13+
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
14+
; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, killed [[COPY]], 0, 0, implicit $mode, implicit $exec
15+
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]]
16+
%0:sreg_64_xexec = IMPLICIT_DEF
17+
%1:sgpr_lo16 = COPY %0.sub1_lo16:sreg_64_xexec
18+
%2:vgpr_16 = COPY %1:sgpr_lo16
19+
%3:vgpr_16 = IMPLICIT_DEF
20+
%4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec
21+
S_ENDPGM 0, implicit %4
22+
...
23+
24+
---
25+
name: fold_16bit_subreg_0
26+
tracksRegLiveness: true
27+
registers:
28+
body: |
29+
bb.0.entry:
30+
; CHECK-LABEL: name: fold_16bit_subreg_0
31+
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
32+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].lo16
33+
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
34+
; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, killed [[COPY]], 0, 0, implicit $mode, implicit $exec
35+
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]]
36+
%0:sreg_64_xexec = IMPLICIT_DEF
37+
%1:sgpr_lo16 = COPY %0.lo16:sreg_64_xexec
38+
%2:vgpr_16 = COPY %1:sgpr_lo16
39+
%3:vgpr_16 = IMPLICIT_DEF
40+
%4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec
41+
S_ENDPGM 0, implicit %4
42+
...
43+
44+
---
45+
name: sgpr_lo16
46+
tracksRegLiveness: true
47+
registers:
48+
body: |
49+
bb.0.entry:
50+
; CHECK-LABEL: name: sgpr_lo16
51+
; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
52+
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
53+
; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec
54+
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]]
55+
%0:sreg_32 = IMPLICIT_DEF
56+
%1:sreg_32 = IMPLICIT_DEF
57+
%2:sreg_32 = S_MOV_B32 30
58+
%3:sgpr_lo16 = COPY %2.lo16:sreg_32
59+
%4:vgpr_16 = COPY %3:sgpr_lo16
60+
%5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
61+
S_ENDPGM 0, implicit %5
62+
...

0 commit comments

Comments
 (0)