Skip to content

AMDGPU: Do not try to commute instruction with same input register #127562

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -682,11 +682,21 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
if (!CanCommute)
return false;

MachineOperand &Op = MI->getOperand(OpNo);
MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);

// One of operands might be an Imm operand, and OpNo may refer to it after
// the call of commuteInstruction() below. Such situations are avoided
// here explicitly as OpNo must be a register operand to be a candidate
// for memory folding.
if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
if (!Op.isReg() || !CommutedOp.isReg())
return false;

// The same problematic situation as with an immediate can arise if both
// inputs are the same register.
if (Op.isReg() && CommutedOp.isReg() &&
(Op.getReg() == CommutedOp.getReg() &&
Op.getSubReg() == CommutedOp.getSubReg()))
return false;

if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/dag-divergence.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
; GCN-LABEL: {{^}}wide_carry_divergence_error:
; GCN: v_sub_u32_e32
; GCN: v_subb_u32_e32
; GCN: v_subbrev_u32_e32
; GCN: v_subbrev_u32_e32
; GCN: v_subb_u32_e32
; GCN: v_subb_u32_e32
define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
%i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
%i1 = sub i128 0, %i
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/div_i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc
; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, 0, v5, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v18, v16
Expand Down Expand Up @@ -2355,8 +2355,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v8, v9
; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v13, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_subbrev_co_u32_e32 v14, vcc, 0, v8, vcc
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v8, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
; GFX9-NEXT: v_or_b32_e32 v10, v13, v15
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/div_v2i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2
; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v18, vcc
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v18, vcc
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
; SDAG-NEXT: v_or_b32_e32 v9, v3, v11
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
Expand Down Expand Up @@ -263,10 +263,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc
; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4
; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v8, vcc
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v9, v10
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
Expand Down Expand Up @@ -872,10 +872,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc
; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22
; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23]
; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
; SDAG-NEXT: v_or_b32_e32 v17, v23, v25
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25]
Expand Down Expand Up @@ -1047,10 +1047,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc
; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v24, vcc
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v24, vcc
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc
; SDAG-NEXT: v_or_b32_e32 v2, v2, v20
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
Expand Down Expand Up @@ -1619,10 +1619,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
; SDAG-NEXT: v_or_b32_e32 v9, v11, v19
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
Expand Down Expand Up @@ -1814,10 +1814,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc
; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10
; SDAG-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v18, vcc
; SDAG-NEXT: v_subb_u32_e32 v12, vcc, 0, v18, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v18, vcc
; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v18, vcc
; SDAG-NEXT: v_or_b32_e32 v14, v14, v12
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
Expand Down Expand Up @@ -2502,10 +2502,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc
; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
; SDAG-NEXT: v_or_b32_e32 v17, v19, v21
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
Expand Down Expand Up @@ -2677,10 +2677,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16
; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/rem_i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v11, vcc
; GFX9-NEXT: v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc
; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
; GFX9-NEXT: v_or_b32_e32 v12, v7, v9
Expand Down Expand Up @@ -1541,8 +1541,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v9
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v12, vcc
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc
; GFX9-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v11, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v11, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[V_SUBBREV_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBBREV_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
; CHECK-NEXT: $vgpr4 = COPY [[V_SUBBREV_U32_e32_]]
; CHECK-NEXT: [[V_SUBB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBB_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
; CHECK-NEXT: $vgpr4 = COPY [[V_SUBB_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr4
%0:vgpr_32 = COPY $vgpr0
%1:sreg_64 = S_MOV_B64 0
Expand Down
Loading