Skip to content

Commit 2652db4

Browse files
Yashwant SinghYashwant Singh
authored andcommitted
Handling ADD|SUB U64 decomposed Pseudos not getting lowered to SDWA form
This patch fixes some of the V_ADD/SUB_U64_PSEUDO not getting converted to their sdwa form. We still get below patterns in generated code: v_and_b32_e32 v0, 0xff, v0 v_add_co_u32_e32 v0, vcc, v1, v0 v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc and, v_and_b32_e32 v2, 0xff, v2 v_add_co_u32_e32 v0, vcc, v0, v2 v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1st and 2nd instructions of both above examples should have been folded into sdwa add with BYTE_0 src operand. The reason being the pseudo instruction is broken down into VOP3 instruction pair of V_ADD_CO_U32_e64 and V_ADDC_U32_e64. The sdwa pass attempts lowering them to their VOP2 form before converting them into sdwa instructions. However V_ADDC_U32_e64 cannot be shrunk to it's VOP2 form if it has non-reg src1 operand. This change attempts to fix that problem by only shrinking V_ADD_CO_U32_e64 instruction. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D136663
1 parent cc07185 commit 2652db4

File tree

4 files changed

+21
-39
lines changed

4 files changed

+21
-39
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -832,9 +832,9 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
832832
}
833833
}
834834

835-
// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
836-
// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA
837-
// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
835+
// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
836+
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
837+
// V_ADD_CO_U32_sdwa.
838838
//
839839
// We are transforming from a VOP3 into a VOP2 form of the instruction.
840840
// %19:vgpr_32 = V_AND_B32_e32 255,
@@ -848,8 +848,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
848848
// %47:vgpr_32 = V_ADD_CO_U32_sdwa
849849
// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
850850
// implicit-def $vcc, implicit $exec
851-
// %48:vgpr_32 = V_ADDC_U32_e32
852-
// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
851+
// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
852+
// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
853853
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
854854
const GCNSubtarget &ST) const {
855855
int Opc = MI.getOpcode();
@@ -868,10 +868,7 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
868868
if (!NextOp)
869869
return;
870870
MachineInstr &MISucc = *NextOp->getParent();
871-
// Can the successor be shrunk?
872-
if (!TII->canShrink(MISucc, *MRI))
873-
return;
874-
int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
871+
875872
// Make sure the carry in/out are subsequently unused.
876873
MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
877874
if (!CarryIn)
@@ -893,7 +890,6 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
893890
return;
894891
}
895892

896-
// Make the two new e32 instruction variants.
897893
// Replace MI with V_{SUB|ADD}_I32_e32
898894
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
899895
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
@@ -903,14 +899,9 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
903899

904900
MI.eraseFromParent();
905901

906-
// Replace MISucc with V_{SUBB|ADDC}_U32_e32
907-
BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
908-
.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
909-
.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
910-
.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
911-
.setMIFlags(MISucc.getFlags());
902+
// Since the carry output of MI is now VCC, update its use in MISucc.
912903

913-
MISucc.eraseFromParent();
904+
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
914905
}
915906

916907
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,

llvm/test/CodeGen/AMDGPU/sdwa-ops.mir

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
# test for 3 consecutive _sdwa's
55
# GFX9-LABEL: name: test1_add_co_sdwa
66
# GFX9: = nsw V_ADD_CO_U32_sdwa
7-
# GFX9-NEXT: = nuw V_ADDC_U32_e32
7+
# GFX9-NEXT: = nuw V_ADDC_U32_e64
88
# GFX9: V_ADD_CO_U32_sdwa
9-
# GFX9-NEXT: V_ADDC_U32_e32
9+
# GFX9-NEXT: V_ADDC_U32_e64
1010
# GFX9: V_ADD_CO_U32_sdwa
11-
# GFX9-NEXT: V_ADDC_U32_e32
11+
# GFX9-NEXT: V_ADDC_U32_e64
1212
---
1313
name: test1_add_co_sdwa
1414
tracksRegLiveness: true
@@ -48,7 +48,7 @@ body: |
4848
# test for VCC interference on sdwa, should generate 1 xform only
4949
# GFX9-LABEL: name: test2_add_co_sdwa
5050
# GFX9: V_ADD_CO_U32_sdwa
51-
# GFX9: V_ADDC_U32_e32
51+
# GFX9: V_ADDC_U32_e64
5252
# GFX9-NOT: V_ADD_CO_U32_sdwa
5353
# GFX9-NOT: V_ADDC_U32_e32
5454
---
@@ -151,7 +151,7 @@ body: |
151151
# test for simple example, should generate sdwa
152152
# GFX9-LABEL: name: test5_add_co_sdwa
153153
# GFX9: V_ADD_CO_U32_sdwa
154-
# GFX9: V_ADDC_U32_e32
154+
# GFX9: V_ADDC_U32_e64
155155
---
156156
name: test5_add_co_sdwa
157157
tracksRegLiveness: true
@@ -388,4 +388,3 @@ body: |
388388
%64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
389389
%62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
390390
GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))
391-

llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@ define amdgpu_kernel void @sdwa_test() local_unnamed_addr #0 {
55
; GFX9: ; %bb.0: ; %bb
66
; GFX9-NEXT: v_add_u32_e32 v1, 10, v0
77
; GFX9-NEXT: v_add_u32_e32 v0, 20, v0
8-
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
9-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
8+
; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
109
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
1110
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1211
; GFX9-NEXT: s_endpgm
@@ -27,16 +26,13 @@ define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspac
2726
; GFX9-LABEL: test_add_co_sdwa:
2827
; GFX9: ; %bb.0: ; %bb
2928
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
30-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
29+
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3130
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
3231
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
33-
; GFX9-NEXT: global_load_dword v2, v1, s[2:3]
34-
; GFX9-NEXT: s_nop 0
32+
; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
3533
; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
36-
; GFX9-NEXT: s_waitcnt vmcnt(1)
37-
; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
3834
; GFX9-NEXT: s_waitcnt vmcnt(0)
39-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
35+
; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
4036
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4137
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
4238
; GFX9-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@ define amdgpu_kernel void @sdwa_test_sub() local_unnamed_addr #0 {
55
; GFX9: ; %bb.0: ; %bb
66
; GFX9-NEXT: v_add_u32_e32 v1, 10, v0
77
; GFX9-NEXT: v_add_u32_e32 v0, 20, v0
8-
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
9-
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v1, v0
8+
; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
109
; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], 0, 0, vcc
1110
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1211
; GFX9-NEXT: s_endpgm
@@ -27,16 +26,13 @@ define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspac
2726
; GFX9-LABEL: test_sub_co_sdwa:
2827
; GFX9: ; %bb.0: ; %bb
2928
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
30-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
29+
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3130
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
3231
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
33-
; GFX9-NEXT: global_load_dword v2, v1, s[2:3]
34-
; GFX9-NEXT: s_nop 0
32+
; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
3533
; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
36-
; GFX9-NEXT: s_waitcnt vmcnt(1)
37-
; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
3834
; GFX9-NEXT: s_waitcnt vmcnt(0)
39-
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
35+
; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
4036
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
4137
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
4238
; GFX9-NEXT: s_endpgm

0 commit comments

Comments
 (0)