Commit e1472db
[GlobalISel] Implement commuting shl (add/or x, c1), c2 -> add/or (shl x, c2), c1 << c2
There's a target hook that's called in DAGCombiner, which we stub here; I'll implement the equivalent override for AArch64 in a subsequent patch, since it's used by a different shift combine there. This change by itself has minor code size improvements on arm64 -Os CTMark:

Program                              size.__text
                                     outputg181ppyy  output8av1cxfn  diff
consumer-typeset/consumer-typeset    410648.00       410648.00        0.0%
tramp3d-v4/tramp3d-v4                364176.00       364176.00        0.0%
kimwitu++/kc                         449216.00       449212.00       -0.0%
7zip/7zip-benchmark                  576128.00       576120.00       -0.0%
sqlite3/sqlite3                      285108.00       285100.00       -0.0%
SPASS/SPASS                          411720.00       411688.00       -0.0%
ClamAV/clamscan                      379868.00       379764.00       -0.0%
Bullet/bullet                        452064.00       451928.00       -0.0%
mafft/pairlocalalign                 246184.00       246108.00       -0.0%
lencod/lencod                        428524.00       428152.00       -0.1%
Geomean difference                                                   -0.0%

Differential Revision: https://reviews.llvm.org/D150086
Parent: bff9fe9
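As a side note (not part of the patch): the rewrite is sound because a left shift distributes over `or`, and over `add` modulo 2^n. A minimal standalone C++ check of both identities, with arbitrarily chosen sample values:

#include <cassert>
#include <cstdint>

// shl by c2 is multiplication by 2^c2 mod 2^32, which distributes over
// addition; `or` distributes because the shift moves every bit of both
// operands by the same amount, so no set bits collide or separate.
int main() {
  uint32_t x = 0xDEADBEEF, c1 = 1012, c2 = 9; // arbitrary sample values
  assert(((x + c1) << c2) == ((x << c2) + (c1 << c2)));
  assert(((x | c1) << c2) == ((x << c2) | (c1 << c2)));
  return 0;
}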

File tree

6 files changed: +195 -8 lines changed

llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

Lines changed: 2 additions & 0 deletions

@@ -302,6 +302,8 @@ class CombinerHelper {
   void applyShiftOfShiftedLogic(MachineInstr &MI,
                                 ShiftOfShiftedLogic &MatchInfo);
 
+  bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Transform a multiply by a power-of-2 value to a left shift.
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
   void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
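For reference, `BuildFnTy` is the deferred-rewrite callback type declared near the top of this header; paraphrased for context (see the header for the authoritative declaration):

// A match function fills one of these in; the generic apply step later
// invokes it with a MachineIRBuilder positioned at the matched instruction.
using BuildFnTy = std::function<void(MachineIRBuilder &)>;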

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 13 additions & 0 deletions

@@ -4034,6 +4034,19 @@ class TargetLowering : public TargetLoweringBase {
     return true;
   }
 
+  /// GlobalISel - return true if it is profitable to move this shift by a
+  /// constant amount through its operand, adjusting any immediate operands as
+  /// necessary to preserve semantics. This transformation may not be desirable
+  /// if it disrupts a particularly auspicious target-specific tree (e.g.
+  /// bitfield extraction in AArch64). By default, it returns true.
+  ///
+  /// @param MI the shift instruction
+  /// @param IsAfterLegal true if running after legalization.
+  virtual bool isDesirableToCommuteWithShift(const MachineInstr &MI,
+                                             bool IsAfterLegal) const {
+    return true;
+  }
+
   // Return AndOrSETCCFoldKind::{AddAnd, ABS} if its desirable to try and
   // optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
   // writing this) is:
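The AArch64 override is deferred to a follow-up patch, so as a sketch only: a target that wants to protect, say, bitfield-extract trees could override the hook as below. `ExampleTLI` and the heuristic described in the comment are assumptions for illustration, not code from this commit.

#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {
// Hypothetical target lowering, for illustration only.
class ExampleTLI : public TargetLowering {
public:
  using TargetLowering::TargetLowering;

  bool isDesirableToCommuteWithShift(const MachineInstr &MI,
                                     bool IsAfterLegal) const override {
    // A real target would inspect MI's users here and return false when the
    // shift feeds a pattern it prefers to select intact (the AArch64
    // motivation being bitfield extraction); otherwise commute, matching
    // the default.
    return true;
  }
};
} // namespace llvm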

llvm/include/llvm/Target/GlobalISel/Combine.td

Lines changed: 9 additions & 1 deletion

@@ -243,6 +243,14 @@ def reduce_shl_of_extend : GICombineRule<
          [{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>;
 
+// Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+// Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+def commute_shift : GICombineRule<
+  (defs root:$d, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SHL):$d,
+        [{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>;
+
 def narrow_binop_feeding_and : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_AND):$root,

@@ -1097,7 +1105,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shift,
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
-    div_rem_to_divrem, funnel_shift_combines,
+    div_rem_to_divrem, funnel_shift_combines, commute_shift,
     form_bitfield_extract, constant_fold, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
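For context on the apply step: `Helper.applyBuildFn` hands the matched `BuildFnTy` lambda a builder positioned at the root instruction, then erases the root. Roughly, paraphrasing CombinerHelper.cpp of this era (not part of this diff):

// Paraphrased for context; see CombinerHelper.cpp for the authoritative body.
void CombinerHelper::applyBuildFn(MachineInstr &MI, BuildFnTy &MatchInfo) {
  Builder.setInstrAndDebugLoc(MI); // insert the rewrite where MI sits
  MatchInfo(Builder);              // run the lambda captured during match
  MI.eraseFromParent();            // drop the original G_SHL
}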

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 35 additions & 0 deletions

@@ -1624,6 +1624,41 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI,
   MI.eraseFromParent();
 }
 
+bool CombinerHelper::matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected G_SHL");
+  // Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+  auto &Shl = cast<GenericMachineInstr>(MI);
+  Register DstReg = Shl.getReg(0);
+  Register SrcReg = Shl.getReg(1);
+  Register ShiftReg = Shl.getReg(2);
+  Register X, C1;
+
+  if (!getTargetLowering().isDesirableToCommuteWithShift(MI, !isPreLegalize()))
+    return false;
+
+  if (!mi_match(SrcReg, MRI,
+                m_OneNonDBGUse(m_any_of(m_GAdd(m_Reg(X), m_Reg(C1)),
+                                        m_GOr(m_Reg(X), m_Reg(C1))))))
+    return false;
+
+  APInt C1Val, C2Val;
+  if (!mi_match(C1, MRI, m_ICstOrSplat(C1Val)) ||
+      !mi_match(ShiftReg, MRI, m_ICstOrSplat(C2Val)))
+    return false;
+
+  auto *SrcDef = MRI.getVRegDef(SrcReg);
+  assert((SrcDef->getOpcode() == TargetOpcode::G_ADD ||
+          SrcDef->getOpcode() == TargetOpcode::G_OR) &&
+         "Unexpected op");
+  LLT SrcTy = MRI.getType(SrcReg);
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto S1 = B.buildShl(SrcTy, X, ShiftReg);
+    auto S2 = B.buildShl(SrcTy, C1, ShiftReg);
+    B.buildInstr(SrcDef->getOpcode(), {DstReg}, {S1, S2});
+  };
+  return true;
+}
+
 bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
                                           unsigned &ShiftVal) {
   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
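Tying this to the first case in the MIR test that follows: for (x + 1) << 2 the lambda emits G_SHL x, 2 plus G_SHL 1, 2, and the latter folds to the G_CONSTANT i32 4 visible in the CHECK lines. A scalar C++ model of that before/after, wraparound included:

#include <cassert>
#include <cstdint>

// Models shl_add_k from the MIR test: (x + 1) << 2 becomes (x << 2) + 4.
static uint32_t before(uint32_t x) { return (x + 1u) << 2; }
static uint32_t after(uint32_t x) { return (x << 2) + (1u << 2); }

int main() {
  for (uint32_t x : {0u, 1u, 0x3FFFFFFFu, 0xFFFFFFFFu}) // incl. wraparound
    assert(before(x) == after(x));
  return 0;
}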
Lines changed: 128 additions & 0 deletions

@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple aarch64 -mattr=+fullfp16 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name:            shl_add_k
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_add_k
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[C1]]
+    ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_ADD %1, %2
+    %5:_(s32) = G_SHL %3, %4(s32)
+    G_STORE %5(s32), %0(p0) :: (store (s32))
+    RET_ReallyLR
+
+...
+---
+name:            shl_or_k
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_or_k
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[C1]]
+    ; CHECK-NEXT: G_STORE [[OR]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_OR %1, %2
+    %5:_(s32) = G_SHL %3, %4(s32)
+    G_STORE %5(s32), %0(p0) :: (store (s32))
+    RET_ReallyLR
+
+...
+---
+name:            shl_or_k_multiuse
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_or_k_multiuse
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C1]](s32)
+    ; CHECK-NEXT: G_STORE [[SHL]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: G_STORE [[OR]](s32), %ptr(p0) :: (store (s32))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %ptr:_(p0) = COPY $x1
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_OR %1, %2
+    %5:_(s32) = G_SHL %3, %4(s32)
+    G_STORE %5(s32), %0(p0) :: (store (s32))
+    G_STORE %3(s32), %ptr(p0) :: (store (s32))
+    RET_ReallyLR
+
+...
+---
+name:            shl_add_k_vector
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_add_k_vector
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %xvec:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY1]](s32), [[COPY1]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: %veccst2:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<4 x s32>) = G_SHL %xvec, %veccst2(<4 x s32>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[SHL]], [[BUILD_VECTOR]]
+    ; CHECK-NEXT: G_STORE [[ADD]](<4 x s32>), [[COPY]](p0) :: (store (<4 x s32>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %xvec:_(<4 x s32>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %2:_(s32) = G_CONSTANT i32 1
+    %veccst:_(<4 x s32>) = G_BUILD_VECTOR %2, %2, %2, %2
+    %4:_(s32) = G_CONSTANT i32 2
+    %veccst2:_(<4 x s32>) = G_BUILD_VECTOR %4, %4, %4, %4
+    %3:_(<4 x s32>) = G_ADD %xvec, %veccst2
+    %5:_(<4 x s32>) = G_SHL %3, %veccst2
+    G_STORE %5(<4 x s32>), %0(p0) :: (store (<4 x s32>))
+    RET_ReallyLR
+
+...

llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll

Lines changed: 8 additions & 7 deletions

@@ -101,19 +101,19 @@ define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) {
 define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
 ; VI-LABEL: add_shl_vgpr_const_inline_const:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x3f4, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7e800, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: add_shl_vgpr_const_inline_const:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3f4
-; GFX9-NEXT:    v_add_lshl_u32 v0, v0, v1, 9
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e800
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 9, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: add_shl_vgpr_const_inline_const:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_lshl_u32 v0, 0x3f4, v0, 9
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 9, 0x7e800
 ; GFX10-NEXT:    ; return to shader part epilog
   %x = add i32 %a, 1012
   %result = shl i32 %x, 9

@@ -124,18 +124,19 @@ define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
 define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
 ; VI-LABEL: add_shl_vgpr_inline_const_x2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x600, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: add_shl_vgpr_inline_const_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_add_lshl_u32 v0, v0, 3, 9
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x600
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 9, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: add_shl_vgpr_inline_const_x2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_lshl_u32 v0, v0, 3, 9
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 9, 0x600
 ; GFX10-NEXT:    ; return to shader part epilog
   %x = add i32 %a, 3
   %result = shl i32 %x, 9
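The new immediates are just the old add constants pre-shifted by 9; a quick standalone sanity check of the hex values in the updated CHECK lines above:

#include <cassert>

int main() {
  assert((0x3f4u << 9) == 0x7e800u); // 1012 << 9, first test
  assert((3u << 9) == 0x600u);       // 3 << 9, second test
  return 0;
}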
