Skip to content

Commit 066787b

Browse files
authored
[AMDGPU][True16][CodeGen] fold clamp update for true16 (#128919)
Look through COPY instructions when checking whether a clamp can be folded; this is needed for v_mad_mixhi_f16 instruction selection with True16.
1 parent 38937ac commit 066787b

File tree

3 files changed: 163 additions and 23 deletions

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1899,7 +1899,13 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
18991899
if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
19001900
return false;
19011901

1902-
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1902+
if (!ClampSrc->getReg().isVirtual())
1903+
return false;
1904+
1905+
// Look through a COPY to its source; COPYs were only observed with True16.
1906+
Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
1907+
MachineInstr *Def =
1908+
MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
19031909

19041910
// The type of clamp must be compatible.
19051911
if (TII->getClampMask(*Def) != TII->getClampMask(MI))

llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

Lines changed: 7 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
269269
}
270270

271271
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
272-
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
273-
; SDAG-GFX1100-TRUE16: ; %bb.0:
274-
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275-
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
276-
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
277-
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
278-
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
279-
;
280-
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
281-
; SDAG-GFX1100-FAKE16: ; %bb.0:
282-
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283-
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
284-
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
272+
; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
273+
; GFX1100: ; %bb.0:
274+
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275+
; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
276+
; GFX1100-NEXT: s_setpc_b64 s[30:31]
285277
;
286278
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
287279
; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
312304
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
313305
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
314306
;
315-
; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
316-
; GISEL-GFX1100: ; %bb.0:
317-
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318-
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
319-
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
320-
;
321307
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
322308
; GISEL-CI: ; %bb.0:
323309
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
15241510
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
15251511
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
15261512
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1527-
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
1528-
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
1529-
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1513+
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
15301514
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
1515+
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
15311516
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
15321517
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
15331518
;

llvm/test/CodeGen/AMDGPU/true16-fold.mir

Lines changed: 149 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,4 +57,153 @@ body: |
5757
%4:vgpr_16 = COPY %3:sgpr_lo16
5858
%5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
5959
S_ENDPGM 0, implicit %5
60+
61+
---
62+
name: fold_16bit_madmix_clamp
63+
tracksRegLiveness: true
64+
registers:
65+
body: |
66+
bb.0:
67+
liveins: $vgpr0, $vgpr1, $vgpr2
68+
; CHECK-LABEL: name: fold_16bit_madmix_clamp
69+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
70+
; CHECK-NEXT: {{ $}}
71+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
72+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
73+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
74+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
75+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
76+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
77+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
78+
; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
79+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
80+
%0:vgpr_32 = COPY $vgpr2
81+
%1:vgpr_32 = COPY $vgpr1
82+
%2:vgpr_32 = COPY $vgpr0
83+
%3:sreg_32 = IMPLICIT_DEF
84+
%4:vgpr_32 = COPY %3
85+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
86+
%6:vgpr_16 = COPY %5
87+
%7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
88+
$vgpr0 = COPY %7
89+
S_ENDPGM 0, implicit $vgpr0
90+
...
91+
92+
---
93+
name: fold_16bit_subreg_1_clamp
94+
tracksRegLiveness: true
95+
registers:
96+
body: |
97+
bb.0:
98+
liveins: $vgpr0, $vgpr1, $vgpr2
99+
; CHECK-LABEL: name: fold_16bit_subreg_1_clamp
100+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
101+
; CHECK-NEXT: {{ $}}
102+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
103+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
104+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
105+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
106+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
107+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
108+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
109+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
110+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
111+
%0:vgpr_32 = COPY $vgpr2
112+
%1:vgpr_32 = COPY $vgpr1
113+
%2:vgpr_32 = COPY $vgpr0
114+
%3:sreg_32 = IMPLICIT_DEF
115+
%4:vgpr_32 = COPY %3
116+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
117+
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec
118+
$vgpr0 = COPY %6
119+
S_ENDPGM 0, implicit $vgpr0
120+
...
121+
122+
---
123+
name: fold_16bit_subreg_2_clamp
124+
tracksRegLiveness: true
125+
registers:
126+
body: |
127+
bb.0:
128+
liveins: $vgpr0, $vgpr1, $vgpr2
129+
; CHECK-LABEL: name: fold_16bit_subreg_2_clamp
130+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
131+
; CHECK-NEXT: {{ $}}
132+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
133+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
134+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
135+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
136+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
137+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
138+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
139+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
140+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
141+
%0:vgpr_32 = COPY $vgpr2
142+
%1:vgpr_32 = COPY $vgpr1
143+
%2:vgpr_32 = COPY $vgpr0
144+
%3:sreg_32 = IMPLICIT_DEF
145+
%4:vgpr_32 = COPY %3
146+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
147+
%6:vgpr_16 = COPY %5.lo16
148+
%7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
149+
$vgpr0 = COPY %7
150+
S_ENDPGM 0, implicit $vgpr0
151+
...
152+
153+
---
154+
name: fold_16bit_phyreg_clamp
155+
tracksRegLiveness: true
156+
registers:
157+
body: |
158+
bb.0:
159+
liveins: $vgpr0, $vgpr1, $vgpr2
160+
; CHECK-LABEL: name: fold_16bit_phyreg_clamp
161+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
162+
; CHECK-NEXT: {{ $}}
163+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
164+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
165+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
166+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
167+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
168+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
169+
; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
170+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
171+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
172+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
173+
%0:vgpr_32 = COPY $vgpr2
174+
%1:vgpr_32 = COPY $vgpr1
175+
%2:vgpr_32 = COPY $vgpr0
176+
%3:sreg_32 = IMPLICIT_DEF
177+
%4:vgpr_32 = COPY %3
178+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
179+
$vgpr10_lo16 = COPY %5
180+
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
181+
$vgpr0 = COPY %6
182+
S_ENDPGM 0, implicit $vgpr0
183+
...
184+
185+
---
186+
name: fold_16bit_undef_clamp
187+
tracksRegLiveness: true
188+
registers:
189+
body: |
190+
bb.0:
191+
liveins: $vgpr0, $vgpr1, $vgpr2
192+
; CHECK-LABEL: name: fold_16bit_undef_clamp
193+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
194+
; CHECK-NEXT: {{ $}}
195+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
196+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
197+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
198+
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
199+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec
200+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
201+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
202+
%0:vgpr_32 = COPY $vgpr2
203+
%1:vgpr_32 = COPY $vgpr1
204+
%2:vgpr_32 = COPY $vgpr0
205+
%3:vgpr_16 = IMPLICIT_DEF
206+
%4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec
207+
$vgpr0 = COPY %4
208+
S_ENDPGM 0, implicit $vgpr0
60209
...

0 commit comments

Comments
 (0)