Skip to content

Commit 42bae9c

Browse files
[AMDGPU] Optimize the register uses if offset inlinable (#101676)
Fold the frame index offset into v_mad if inlinable.
1 parent 4cfbd49 commit 42bae9c

File tree

3 files changed

+42
-35
lines changed

3 files changed

+42
-35
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2568,26 +2568,35 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25682568
} else
25692569
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
25702570
} else {
2571-
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
2572-
TmpResultReg)
2573-
.addImm(Offset);
25742571
assert(Offset > 0 &&
25752572
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
25762573
"offset is unsafe for v_mad_u32_u24");
2577-
// We start with a frame pointer with a wave space value, and an
2578-
// offset in lane-space. We are materializing a lane space
2579-
// value. We can either do a right shift of the frame pointer to
2580-
// get to lane space, or a left shift of the offset to get to
2581-
// wavespace. We can right shift after the computation to get
2582-
// back to the desired per-lane value.
2583-
// We are using the mad_u32_u24 primarily as an add with no
2584-
// carry out clobber.
2574+
2575+
// We start with a frame pointer with a wave space value, and
2576+
// an offset in lane-space. We are materializing a lane space
2577+
// value. We can either do a right shift of the frame pointer
2578+
// to get to lane space, or a left shift of the offset to get
2579+
// to wavespace. We can right shift after the computation to
2580+
// get back to the desired per-lane value. We are using the
2581+
// mad_u32_u24 primarily as an add with no carry out clobber.
2582+
bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
2583+
Offset, ST.hasInv2PiInlineImm());
2584+
if (!IsInlinableLiteral) {
2585+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
2586+
TmpResultReg)
2587+
.addImm(Offset);
2588+
}
2589+
25852590
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
2586-
TmpResultReg)
2587-
.addReg(TmpResultReg, RegState::Kill)
2588-
.addImm(ST.getWavefrontSize())
2589-
.addReg(FrameReg)
2590-
.addImm(0);
2591+
TmpResultReg);
2592+
2593+
if (!IsInlinableLiteral) {
2594+
Add.addReg(TmpResultReg, RegState::Kill);
2595+
} else {
2596+
// We fold the offset into mad itself if it's inlinable.
2597+
Add.addImm(Offset);
2598+
}
2599+
Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
25912600
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
25922601
TmpResultReg)
25932602
.addImm(ST.getWavefrontSizeLog2())

llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -681,10 +681,10 @@ body: |
681681

682682

683683
---
684-
name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
684+
name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
685685
tracksRegLiveness: true
686686
stack:
687-
- { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
687+
- { id: 0, type: default, size: 24, alignment: 16, stack-id: default }
688688
- { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
689689
machineFunctionInfo:
690690
stackPtrOffsetReg: '$sgpr32'
@@ -693,7 +693,7 @@ body: |
693693
liveins: $sgpr4, $sgpr5, $vgpr0
694694
695695
696-
; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
696+
; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
697697
; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
698698
; GFX8-NEXT: {{ $}}
699699
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -706,8 +706,7 @@ body: |
706706
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
707707
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
708708
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
709-
; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
710-
; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
709+
; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 24, 64, $sgpr32, 0, implicit $exec
711710
; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec
712711
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
713712
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -720,7 +719,7 @@ body: |
720719
; GFX8-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
721720
; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
722721
;
723-
; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
722+
; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
724723
; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
725724
; GFX900-NEXT: {{ $}}
726725
; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -734,7 +733,7 @@ body: |
734733
; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
735734
; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
736735
; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
737-
; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
736+
; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 24, killed $vgpr0, implicit $exec
738737
; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
739738
; GFX900-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
740739
; GFX900-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -746,7 +745,7 @@ body: |
746745
; GFX900-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
747746
; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
748747
;
749-
; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
748+
; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
750749
; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
751750
; GFX90A-NEXT: {{ $}}
752751
; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -760,7 +759,7 @@ body: |
760759
; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
761760
; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
762761
; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
763-
; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
762+
; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 24, killed $vgpr0, implicit $exec
764763
; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
765764
; GFX90A-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
766765
; GFX90A-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -772,7 +771,7 @@ body: |
772771
; GFX90A-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
773772
; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
774773
;
775-
; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
774+
; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
776775
; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
777776
; GFX1010-NEXT: {{ $}}
778777
; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -786,7 +785,7 @@ body: |
786785
; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
787786
; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
788787
; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
789-
; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
788+
; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 24, killed $vgpr0, implicit $exec
790789
; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
791790
; GFX1010-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
792791
; GFX1010-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -798,7 +797,7 @@ body: |
798797
; GFX1010-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
799798
; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
800799
;
801-
; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
800+
; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
802801
; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
803802
; GFX1100-NEXT: {{ $}}
804803
; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -811,7 +810,7 @@ body: |
811810
; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
812811
; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
813812
; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
814-
; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
813+
; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
815814
; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
816815
; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
817816
; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
@@ -825,7 +824,7 @@ body: |
825824
; GFX1100-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
826825
; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
827826
;
828-
; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
827+
; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
829828
; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
830829
; GFX1200-NEXT: {{ $}}
831830
; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -838,7 +837,7 @@ body: |
838837
; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
839838
; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
840839
; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
841-
; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
840+
; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
842841
; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
843842
; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
844843
; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
@@ -1070,3 +1069,4 @@ body: |
10701069
S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
10711070
10721071
...
1072+

llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -835,12 +835,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
835835
; GFX7-NEXT: v_writelane_b32 v21, s56, 25
836836
; GFX7-NEXT: v_writelane_b32 v21, s57, 26
837837
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
838-
; GFX7-NEXT: v_mov_b32_e32 v22, 16
839838
; GFX7-NEXT: v_writelane_b32 v21, s58, 27
840839
; GFX7-NEXT: ;;#ASMSTART
841840
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
842841
; GFX7-NEXT: ;;#ASMEND
843-
; GFX7-NEXT: v_mad_u32_u24 v22, v22, 64, s32
842+
; GFX7-NEXT: v_mad_u32_u24 v22, 16, 64, s32
844843
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 6, v22
845844
; GFX7-NEXT: v_writelane_b32 v21, s59, 28
846845
; GFX7-NEXT: v_readfirstlane_b32 s59, v22
@@ -918,12 +917,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
918917
; GFX8-NEXT: v_writelane_b32 v21, s56, 25
919918
; GFX8-NEXT: v_writelane_b32 v21, s57, 26
920919
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
921-
; GFX8-NEXT: v_mov_b32_e32 v22, 16
922920
; GFX8-NEXT: v_writelane_b32 v21, s58, 27
923921
; GFX8-NEXT: ;;#ASMSTART
924922
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
925923
; GFX8-NEXT: ;;#ASMEND
926-
; GFX8-NEXT: v_mad_u32_u24 v22, v22, 64, s32
924+
; GFX8-NEXT: v_mad_u32_u24 v22, 16, 64, s32
927925
; GFX8-NEXT: v_lshrrev_b32_e32 v22, 6, v22
928926
; GFX8-NEXT: v_writelane_b32 v21, s59, 28
929927
; GFX8-NEXT: v_readfirstlane_b32 s59, v22

0 commit comments

Comments
 (0)