[AMDGPU] Optimize the register uses if offset inlinable #101676
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Pankaj Dwivedi (PankajDwivedi-25)

Changes: Fold the frame index offset into v_mad if inlinable.

Full diff: https://github.com/llvm/llvm-project/pull/101676.diff (3 files affected)
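For context, the sequence this touches materializes a per-lane stack address from a wave-space frame pointer: the v_mad_u32_u24 scales the lane-space offset up into wave space (offset times the wavefront size) and adds the frame pointer, and the following v_lshrrev converts the sum back to lane space. A quick sanity check of that arithmetic, with the wave size of 64 and offset of 16 used by the updated tests, and a frame-pointer value invented for the example:

// Illustrative only: 16 and 64 match the updated tests below; the
// frame-pointer value 4096 is made up for this example.
static_assert(((16u * 64u) + 4096u) >> 6 == (4096u >> 6) + 16u,
              "mad-then-lshr yields FrameReg/64 + Offset per lane");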
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ba49f4a309ebb..14428bfb1ac45 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2568,26 +2568,39 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
} else
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
- TmpResultReg)
- .addImm(Offset);
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
- // We start with a frame pointer with a wave space value, and an
- // offset in lane-space. We are materializing a lane space
- // value. We can either do a right shift of the frame pointer to
- // get to lane space, or a left shift of the offset to get to
- // wavespace. We can right shift after the computation to get
- // back to the desired per-lane value.
- // We are using the mad_u32_u24 primarily as an add with no
- // carry out clobber.
+
+ // We start with a frame pointer with a wave space value, and
+ // an offset in lane-space. We are materializing a lane space
+ // value. We can either do a right shift of the frame pointer
+ // to get to lane space, or a left shift of the offset to get
+ // to wavespace. We can right shift after the computation to
+ // get back to the desired per-lane value. We are using the
+ // mad_u32_u24 primarily as an add with no carry out clobber.
+ bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
+ Offset, ST.hasInv2PiInlineImm());
+ if (!IsInlinableLiteral)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ TmpResultReg)
+ .addImm(Offset);
+
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
- TmpResultReg)
- .addReg(TmpResultReg, RegState::Kill)
- .addImm(ST.getWavefrontSize())
- .addReg(FrameReg)
- .addImm(0);
+ TmpResultReg);
+
+ if (!IsInlinableLiteral) {
+ Add.addReg(TmpResultReg, RegState::Kill)
+ .addImm(ST.getWavefrontSize())
+ .addReg(FrameReg)
+ .addImm(0);
+ } else {
+ // We fold the offset into mad itself if it's inlinable.
+ Add.addImm(Offset)
+ .addImm(ST.getWavefrontSize())
+ .addReg(FrameReg)
+ .addImm(0);
+ }
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
TmpResultReg)
.addImm(ST.getWavefrontSizeLog2())
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 08c2904c601ad..acc84183b3a27 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -706,8 +706,7 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
- ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 64, 64, $sgpr32, 0, implicit $exec
; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 87cfaec208897..6346406fa8941 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -835,12 +835,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: v_writelane_b32 v21, s56, 25
; GFX7-NEXT: v_writelane_b32 v21, s57, 26
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_mov_b32_e32 v22, 16
; GFX7-NEXT: v_writelane_b32 v21, s58, 27
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX7-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 6, v22
; GFX7-NEXT: v_writelane_b32 v21, s59, 28
; GFX7-NEXT: v_readfirstlane_b32 s59, v22
@@ -918,12 +917,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: v_writelane_b32 v21, s56, 25
; GFX8-NEXT: v_writelane_b32 v21, s57, 26
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_mov_b32_e32 v22, 16
; GFX8-NEXT: v_writelane_b32 v21, s58, 27
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX8-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX8-NEXT: v_lshrrev_b32_e32 v22, 6, v22
; GFX8-NEXT: v_writelane_b32 v21, s59, 28
; GFX8-NEXT: v_readfirstlane_b32 s59, v22
// mad_u32_u24 primarily as an add with no carry out clobber.
bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
    Offset, ST.hasInv2PiInlineImm());
if (!IsInlinableLiteral)
Braces
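That is, the multi-line statement under the new if should be braced. A sketch of the requested form (same statements as in the diff above, with braces added):

if (!IsInlinableLiteral) {
  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpResultReg)
      .addImm(Offset);
}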
              TmpResultReg);

if (!IsInlinableLiteral) {
  Add.addReg(TmpResultReg, RegState::Kill)
You can conditionally add this one operand and then share the rest of the .adds
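A sketch of that suggestion (not the committed code): build the mad once, vary only the first source operand, and share the trailing .add calls:

auto Add =
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64), TmpResultReg);
if (IsInlinableLiteral)
  Add.addImm(Offset); // inlinable offset folded directly into the mad
else
  Add.addReg(TmpResultReg, RegState::Kill); // offset materialized by v_mov above
Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);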
@@ -706,8 +706,7 @@ body: |
 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
 ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
 ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
- ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 64, 64, $sgpr32, 0, implicit $exec
Might be a good idea to change the size of stack.0 to something different from the wavesize so it's clear we didn't get the right answer by accident
That's a really great point, adding another test.
Don't need to add one, just change the size of stack.0 to 60 or something
sure
Force-pushed from 3aa3c7a to e110734 (Compare)
Fold the frame index offset into v_mad if inlinable.