Skip to content

Commit 2cf47f0

Browse files
committed
[AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15), 0x10001), 0xffff) to CMLTz
This patch mirrors the following SelectionDAG patch for GlobalISel: https://reviews.llvm.org/D130874
1 parent a1a8f6f commit 2cf47f0

File tree

3 files changed

+90
-92
lines changed

3 files changed

+90
-92
lines changed

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule <
265265
(apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
266266
>;
267267

268+
// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz.
//
// The rule is rooted on any G_MUL; the actual pattern check is done in C++ by
// matchCombineMulCMLT, which stores a register in $matchinfo when it succeeds.
// applyCombineMulCMLT then receives that same register and performs the
// rewrite. The constants in the pattern above are the 32-bit-element case;
// the matcher derives the expected constants from the element width.
269+
def combine_mul_cmlt : GICombineRule<
270+
(defs root:$root, register_matchinfo:$matchinfo),
271+
(match (wip_match_opcode G_MUL):$root,
272+
[{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
273+
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
274+
>;
275+
268276
// Post-legalization combines which should happen at all optimization levels.
269277
// (E.g. ones that facilitate matching for the selector) For example, matching
270278
// pseudos.
@@ -296,5 +304,6 @@ def AArch64PostLegalizerCombiner
296304
split_store_zero_128, undef_combines,
297305
select_to_minmax, or_to_bsp, combine_concat_vector,
298306
commute_constant_to_rhs,
299-
push_freeze_to_prevent_poison_from_propagating]> {
307+
push_freeze_to_prevent_poison_from_propagating,
308+
combine_mul_cmlt]> {
300309
}

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
381381
MI.eraseFromParent();
382382
}
383383

384+
// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
385+
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
386+
Register &SrcReg) {
387+
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
388+
389+
if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
390+
DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
391+
DstTy != LLT::fixed_vector(8, 16))
392+
return false;
393+
394+
auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
395+
if (AndMI->getOpcode() != TargetOpcode::G_AND)
396+
return false;
397+
auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
398+
if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
399+
return false;
400+
401+
// Check the constant splat values
402+
auto V1 = isConstantOrConstantSplatVector(
403+
*MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
404+
auto V2 = isConstantOrConstantSplatVector(
405+
*MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
406+
auto V3 = isConstantOrConstantSplatVector(
407+
*MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
408+
if (!V1.has_value() || !V2.has_value() || !V3.has_value())
409+
return false;
410+
unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
411+
if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
412+
V3 != (HalfSize - 1))
413+
return false;
414+
415+
SrcReg = LShrMI->getOperand(1).getReg();
416+
417+
return true;
418+
}
419+
420+
/// Rewrite a G_MUL accepted by matchCombineMulCMLT as a signed "less than
/// zero" compare performed on elements of half the width.
///
/// The source is bitcast to a vector with twice as many elements of half the
/// size, compared (signed) against zero, and the compare result is bitcast
/// back into the original destination register. \p MI is erased afterwards.
///
/// \param MI      The G_MUL being replaced.
/// \param MRI     Register info used to look up the destination type.
/// \param B       Builder used to emit the replacement instructions.
/// \param SrcReg  The register captured by the matcher (the shifted value).
void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  const Register ResultReg = MI.getOperand(0).getReg();
  const LLT ResultTy = MRI.getType(ResultReg);

  // Same total size as the result: double the element count, halve the width.
  const auto DoubledCount = ResultTy.getElementCount().multiplyCoefficientBy(2);
  const LLT WideCountTy = ResultTy.changeElementCount(DoubledCount);
  const LLT NarrowTy =
      WideCountTy.changeElementSize(ResultTy.getScalarSizeInBits() / 2);

  const Register Zero = B.buildConstant(NarrowTy, 0).getReg(0);
  const Register NarrowSrc =
      B.buildInstr(TargetOpcode::G_BITCAST, {NarrowTy}, {SrcReg}).getReg(0);
  const Register IsNegative =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, NarrowTy, NarrowSrc, Zero)
          .getReg(0);

  // Define the original destination directly from the compare result.
  B.buildInstr(TargetOpcode::G_BITCAST, {ResultReg}, {IsNegative});
  MI.eraseFromParent();
}
438+
384439
class AArch64PostLegalizerCombinerImpl : public Combiner {
385440
protected:
386441
// TODO: Make CombinerHelper methods const.

llvm/test/CodeGen/AArch64/mulcmle.ll

Lines changed: 25 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -24,132 +24,66 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
2424
}
2525

2626
define <2 x i64> @v2i64(<2 x i64> %a) {
27-
; CHECK-SD-LABEL: v2i64:
28-
; CHECK-SD: // %bb.0:
29-
; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0
30-
; CHECK-SD-NEXT: ret
31-
;
32-
; CHECK-GI-LABEL: v2i64:
33-
; CHECK-GI: // %bb.0:
34-
; CHECK-GI-NEXT: movi v1.4s, #1
35-
; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31
36-
; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff
37-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
38-
; CHECK-GI-NEXT: mov d3, v2.d[1]
39-
; CHECK-GI-NEXT: fmov x9, d2
40-
; CHECK-GI-NEXT: mov d1, v0.d[1]
41-
; CHECK-GI-NEXT: fmov x8, d0
42-
; CHECK-GI-NEXT: fmov x10, d3
43-
; CHECK-GI-NEXT: mul x8, x8, x9
44-
; CHECK-GI-NEXT: fmov x9, d1
45-
; CHECK-GI-NEXT: mul x9, x9, x10
46-
; CHECK-GI-NEXT: fmov d0, x8
47-
; CHECK-GI-NEXT: mov v0.d[1], x9
48-
; CHECK-GI-NEXT: ret
27+
; CHECK-LABEL: v2i64:
28+
; CHECK: // %bb.0:
29+
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
30+
; CHECK-NEXT: ret
4931
%b = lshr <2 x i64> %a, <i64 31, i64 31>
5032
%c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
5133
%d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
5234
ret <2 x i64> %d
5335
}
5436

5537
define <2 x i32> @v2i32(<2 x i32> %a) {
56-
; CHECK-SD-LABEL: v2i32:
57-
; CHECK-SD: // %bb.0:
58-
; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0
59-
; CHECK-SD-NEXT: ret
60-
;
61-
; CHECK-GI-LABEL: v2i32:
62-
; CHECK-GI: // %bb.0:
63-
; CHECK-GI-NEXT: movi v1.4h, #1
64-
; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15
65-
; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff
66-
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
67-
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
68-
; CHECK-GI-NEXT: ret
38+
; CHECK-LABEL: v2i32:
39+
; CHECK: // %bb.0:
40+
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
41+
; CHECK-NEXT: ret
6942
%b = lshr <2 x i32> %a, <i32 15, i32 15>
7043
%c = and <2 x i32> %b, <i32 65537, i32 65537>
7144
%d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
7245
ret <2 x i32> %d
7346
}
7447

7548
define <4 x i32> @v4i32(<4 x i32> %a) {
76-
; CHECK-SD-LABEL: v4i32:
77-
; CHECK-SD: // %bb.0:
78-
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
79-
; CHECK-SD-NEXT: ret
80-
;
81-
; CHECK-GI-LABEL: v4i32:
82-
; CHECK-GI: // %bb.0:
83-
; CHECK-GI-NEXT: movi v1.8h, #1
84-
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
85-
; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
86-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
87-
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s
88-
; CHECK-GI-NEXT: ret
49+
; CHECK-LABEL: v4i32:
50+
; CHECK: // %bb.0:
51+
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
52+
; CHECK-NEXT: ret
8953
%b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
9054
%c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
9155
%d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
9256
ret <4 x i32> %d
9357
}
9458

9559
define <8 x i32> @v8i32(<8 x i32> %a) {
96-
; CHECK-SD-LABEL: v8i32:
97-
; CHECK-SD: // %bb.0:
98-
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
99-
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
100-
; CHECK-SD-NEXT: ret
101-
;
102-
; CHECK-GI-LABEL: v8i32:
103-
; CHECK-GI: // %bb.0:
104-
; CHECK-GI-NEXT: movi v2.8h, #1
105-
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
106-
; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15
107-
; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
108-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
109-
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
110-
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s
111-
; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s
112-
; CHECK-GI-NEXT: ret
60+
; CHECK-LABEL: v8i32:
61+
; CHECK: // %bb.0:
62+
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
63+
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
64+
; CHECK-NEXT: ret
11365
%b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
11466
%c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
11567
%d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
11668
ret <8 x i32> %d
11769
}
11870

11971
define <4 x i16> @v4i16(<4 x i16> %a) {
120-
; CHECK-SD-LABEL: v4i16:
121-
; CHECK-SD: // %bb.0:
122-
; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0
123-
; CHECK-SD-NEXT: ret
124-
;
125-
; CHECK-GI-LABEL: v4i16:
126-
; CHECK-GI: // %bb.0:
127-
; CHECK-GI-NEXT: movi v1.8b, #1
128-
; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7
129-
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
130-
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
131-
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h
132-
; CHECK-GI-NEXT: ret
72+
; CHECK-LABEL: v4i16:
73+
; CHECK: // %bb.0:
74+
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
75+
; CHECK-NEXT: ret
13376
%b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
13477
%c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
13578
%d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
13679
ret <4 x i16> %d
13780
}
13881

13982
define <8 x i16> @v8i16(<8 x i16> %a) {
140-
; CHECK-SD-LABEL: v8i16:
141-
; CHECK-SD: // %bb.0:
142-
; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0
143-
; CHECK-SD-NEXT: ret
144-
;
145-
; CHECK-GI-LABEL: v8i16:
146-
; CHECK-GI: // %bb.0:
147-
; CHECK-GI-NEXT: movi v1.16b, #1
148-
; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7
149-
; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
150-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
151-
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
152-
; CHECK-GI-NEXT: ret
83+
; CHECK-LABEL: v8i16:
84+
; CHECK: // %bb.0:
85+
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
86+
; CHECK-NEXT: ret
15387
%b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
15488
%c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
15589
%d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>

0 commit comments

Comments
 (0)