Skip to content

[AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant #71035

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
let HasExtVOP3DPP = 0;
}

// Pattern leaf that accepts an i32 immediate V only when V - 1 is a power of
// two, i.e. V has the form 2^n + 1 (2, 3, 5, 9, 17, ...).
def IsPow2Plus1: PatLeaf<(i32 imm), [{
// Narrow to 32 bits before decrementing so the subtraction is carried out in
// unsigned 32-bit arithmetic (V == 0 wraps to 0xFFFFFFFF rather than going
// negative in a wider type).
return isPowerOf2_32(static_cast<uint32_t>(N->getZExtValue()) - 1);
}]>;

// Immediate transform that rewrites a constant V into a target constant
// holding Log2_32(V - 1). Note that, despite the name, this is the logarithm
// of V - 1 and not of V itself: for V = 2^n + 1 the result is n.
def Log2_32: SDNodeXForm<imm, [{
// The decrement is done on the value truncated to 32 bits.
const uint32_t Pow2 = static_cast<uint32_t>(N->getZExtValue()) - 1;
return CurDAG->getTargetConstant(Log2_32(Pow2), SDLoc(N), MVT::i32);
}]>;

let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
Expand Down Expand Up @@ -612,6 +622,10 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;

// Select an i32 mul (matched through DivergentBinFrag) whose second operand
// is an IsPow2Plus1 immediate as V_LSHL_ADD_U32_e64. $src0 is passed both as
// the value to shift and as the addend, and the shift amount is produced from
// the immediate by the Log2_32 transform, so x * (2^n + 1) becomes
// (x << n) + x.
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;

let SubtargetPredicate = isGFX940Plus in
def : GCNPat<
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/CodeGen/AMDGPU/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2395,6 +2395,45 @@ entry:
ret void
}

; Multiply by 9 (= 2^3 + 1). The expectations below show the multiply being
; emitted as v_lshl_add_u32 with a shift amount of 3 under the GFX9/GFX10/GFX11
; prefixes, while the two oldest GCN prefixes still expect v_mul_lo_u32.
define i32 @mul_pow2_plus_1(i32 %val) {
; SI-LABEL: mul_pow2_plus_1:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mul_lo_u32 v0, v0, 9
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_pow2_plus_1:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, 9
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mul_pow2_plus_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: mul_pow2_plus_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mul_pow2_plus_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: mul_pow2_plus_1:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
%mul = mul i32 %val, 9
ret i32 %mul
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: bb.2.Flow:
; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; SI-NEXT: {{ $}}
; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %31:vgpr_32, %bb.1, %10, %bb.4
; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4
; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %34:vgpr_32, %bb.4
; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI-NEXT: S_BRANCH %bb.3
; SI-NEXT: {{ $}}
Expand All @@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: successors: %bb.2(0x80000000)
; SI-NEXT: {{ $}}
; SI-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec
; SI-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
; SI-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec
; SI-NEXT: S_BRANCH %bb.2
; SI-NEXT: {{ $}}
; SI-NEXT: bb.5.if.end:
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
Original file line number Diff line number Diff line change
Expand Up @@ -92,32 +92,32 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: .LBB2_1: ; %if.end
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2
; SI-NEXT: v_add_nc_u32_e32 v2, 1, v0
; SI-NEXT: v_add_nc_u32_e32 v2, 1, v3
; SI-NEXT: s_add_i32 s1, s1, 1
; SI-NEXT: s_cmp_lt_i32 s1, s0
; SI-NEXT: s_cbranch_scc0 .LBB2_6
; SI-NEXT: .LBB2_2: ; %for.body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo
; SI-NEXT: s_xor_b32 s2, exec_lo, s2
; SI-NEXT: ; %bb.3: ; %else
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
; SI-NEXT: v_mul_lo_u32 v0, v2, 3
; SI-NEXT: v_mul_f32_e32 v3, v1, v2
; SI-NEXT: v_mul_f32_e32 v0, v1, v2
; SI-NEXT: v_lshl_add_u32 v3, v2, 1, v2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; %bb.4: ; %Flow
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
; SI-NEXT: s_andn2_saveexec_b32 s2, s2
; SI-NEXT: s_cbranch_execz .LBB2_1
; SI-NEXT: ; %bb.5: ; %if
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
; SI-NEXT: v_mul_f32_e32 v3, s1, v1
; SI-NEXT: v_add_nc_u32_e32 v0, 1, v2
; SI-NEXT: v_mul_f32_e32 v0, s1, v1
; SI-NEXT: v_add_nc_u32_e32 v3, 1, v2
; SI-NEXT: s_branch .LBB2_1
; SI-NEXT: .LBB2_6: ; %for.end
; SI-NEXT: v_add_f32_e32 v0, v0, v3
; SI-NEXT: v_add_f32_e32 v0, v3, v0
; SI-NEXT: ; return to shader part epilog
entry:
; %break = icmp sgt i32 %bound, 0
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/wqm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5
; GFX9-W64-NEXT: ; %bb.4: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
Expand Down Expand Up @@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5
; GFX10-W32-NEXT: ; %bb.4: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
Expand Down