-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16][CodeGen] fold clamp update for true16 #128919
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesCheck through COPY for possible clamp folding for v_mad_mixhi_f16 isel Full diff: https://github.com/llvm/llvm-project/pull/128919.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index eb9aabf8b6317..9d4a7f30baa29 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1721,6 +1721,10 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
return false;
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ MachineInstr *OrigDef = Def;
+ // Look through COPY. COPY only observed with True16.
+ if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
+ Def = MRI->getVRegDef(Def->getOperand(1).getReg());
// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1738,7 +1742,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
- Register DefReg = Def->getOperand(0).getReg();
+ Register DefReg = OrigDef->getOperand(0).getReg();
Register MIDstReg = MI.getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, DefReg)) {
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 9949b823dfec1..51965acec9573 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
-; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-TRUE16: ; %bb.0:
-; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-FAKE16: ; %bb.0:
-; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GISEL-GFX1100: ; %bb.0:
-; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1546,10 +1532,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
|
ping! |
✅ With the latest revision this PR passed the C/C++ code formatter. |
2982268
to
2cf303b
Compare
CI error is not related |
ping! |
2cf303b
to
3dad0c3
Compare
rebased |
12a300c
to
dc5dbec
Compare
All comments on this PR has been address. If there is no other comments on it I will merge this tomorrow. Thanks! |
rebased |
%3:sreg_32 = IMPLICIT_DEF | ||
%4:vgpr_32 = COPY %3 | ||
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec | ||
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to make this case and the other subreg case fold, as this is a more correct MIR than in fold_16bit_madmix_clamp (COPY of different size src and dst in that one)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can keep it like this as we should not hit this case anyway.
We currently have the problem of different size COPY and thus we actually hit the case in line 5. We will need to support the true16 version of V_FMA_MIXLO_F16 to fix the COPY issue and thus we will have vpgr16 return without subreg.
d750f87
to
4842f7c
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
rebased and resolved conflicts |
Check through COPY for possible clamp folding for v_mad_mixhi_f16 isel