Skip to content

Commit d858a53

Browse files
committed
[AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15), 0x10001), 0xffff) to CMLTz
1 parent 69a7174 commit d858a53

File tree

3 files changed

+89
-92
lines changed

3 files changed

+89
-92
lines changed

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule <
265265
(apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
266266
>;
267267

268+
// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz.
// The constants above are the 32-bit-element instance of the pattern;
// matchCombineMulCMLT derives the expected shift amount and masks from half of
// the destination element width. The rule is rooted on any G_MUL, and the
// register found by the match step (the shift's source) is passed to the apply
// step through $matchinfo.
269+
def combine_mul_cmlt : GICombineRule<
270+
(defs root:$root, register_matchinfo:$matchinfo),
271+
(match (wip_match_opcode G_MUL):$root,
272+
[{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
273+
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
274+
>;
275+
268276
// Post-legalization combines which should happen at all optimization levels.
269277
// (E.g. ones that facilitate matching for the selector) For example, matching
270278
// pseudos.
@@ -295,5 +303,6 @@ def AArch64PostLegalizerCombiner
295303
ptr_add_immed_chain, overlapping_and,
296304
split_store_zero_128, undef_combines,
297305
select_to_minmax, or_to_bsp, combine_concat_vector,
298-
commute_constant_to_rhs]> {
306+
commute_constant_to_rhs,
307+
combine_mul_cmlt]> {
299308
}

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,60 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
381381
MI.eraseFromParent();
382382
}
383383

384+
// Matches
//   G_MUL(G_AND(G_LSHR(X, HalfSize - 1), 1 | (1 << HalfSize)), mask(HalfSize))
// where HalfSize is half of the destination element width and all three
// constants are recognised by isConstantOrConstantSplatVector.
//
// Why this is a compare: the shift moves the top bit of each half-width lane
// of X down to the lowest bit of that half, the AND keeps only those two bits,
// and multiplying by the all-ones low-half mask replicates each kept bit
// across its half. Every half-width lane therefore ends up all-ones when its
// top (sign) bit was set and zero otherwise.
//
// \param MI     the candidate G_MUL.
// \param MRI    used to query the result type and the defining instructions.
// \param SrcReg written with X (operand 1 of the G_LSHR) when the match
//               succeeds; left untouched otherwise.
// \returns true if the whole pattern matched.
//
// NOTE(review): only operand 1 of the G_MUL/G_AND is searched for the inner
// instruction and only operand 2 for the constant, so this presumably relies
// on constants having been commuted to the RHS beforehand -- confirm.
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
385+
Register &SrcReg) {
386+
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
387+
388+
// Only these five fixed-width vector result types are handled.
if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
389+
DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
390+
DstTy != LLT::fixed_vector(8, 16))
391+
return false;
392+
393+
// Walk the first-operand chain: G_MUL <- G_AND <- G_LSHR, using
// getDefIgnoringCopies at each step.
auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
394+
if (AndMI->getOpcode() != TargetOpcode::G_AND)
395+
return false;
396+
auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
397+
if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
398+
return false;
399+
400+
// Check the constant splat values
// V1 = multiplier, V2 = AND mask, V3 = shift amount (each taken from
// operand 2 of the respective instruction).
401+
auto V1 = isConstantOrConstantSplatVector(
402+
*MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
403+
auto V2 = isConstantOrConstantSplatVector(
404+
*MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
405+
auto V3 = isConstantOrConstantSplatVector(
406+
*MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
407+
if (!V1.has_value() || !V2.has_value() || !V3.has_value())
408+
return false;
409+
// For 32-bit elements HalfSize is 16, so the expected constants are
// 0xffff (multiplier), 0x10001 (AND mask) and 15 (shift amount).
unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
410+
if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
411+
V3 != (HalfSize - 1))
412+
return false;
413+
414+
// Hand the un-shifted input to the apply step.
SrcReg = LShrMI->getOperand(1).getReg();
415+
416+
return true;
417+
}
418+
419+
// Rewrites the G_MUL matched by matchCombineMulCMLT as a signed "less than
// zero" compare on lanes of half the width:
//   DstReg = G_BITCAST(G_ICMP slt (G_BITCAST SrcReg to HalfTy), 0)
// where HalfTy has twice as many elements as the result type, each half as
// wide. The original G_MUL is erased afterwards.
//
// \param MI     the matched G_MUL; erased by this function.
// \param MRI    used to query the result type.
// \param B      builder used to emit the replacement instructions.
// \param SrcReg the shift source captured by the match step.
//
// NOTE(review): the builder's insertion point is not set in this function;
// presumably the caller has already positioned B at MI -- confirm.
void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
420+
MachineIRBuilder &B, Register &SrcReg) {
421+
Register DstReg = MI.getOperand(0).getReg();
422+
LLT DstTy = MRI.getType(DstReg);
423+
// Same total vector size as DstTy: double the lane count, halve the lane
// width.
LLT HalfTy =
424+
DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
425+
.changeElementSize(DstTy.getScalarSizeInBits() / 2);
426+
427+
Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
428+
// Reinterpret the input as half-width lanes and compare each against zero.
Register CastReg =
429+
B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
430+
Register CMLTReg =
431+
B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
432+
.getReg(0);
433+
434+
// Cast the compare result back into the original destination register; the
// value returned by the trailing getReg(0) is not used.
B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
435+
MI.eraseFromParent();
436+
}
437+
384438
class AArch64PostLegalizerCombinerImpl : public Combiner {
385439
protected:
386440
// TODO: Make CombinerHelper methods const.

llvm/test/CodeGen/AArch64/mulcmle.ll

Lines changed: 25 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -24,132 +24,66 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
2424
}
2525

2626
define <2 x i64> @v2i64(<2 x i64> %a) {
27-
; CHECK-SD-LABEL: v2i64:
28-
; CHECK-SD: // %bb.0:
29-
; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0
30-
; CHECK-SD-NEXT: ret
31-
;
32-
; CHECK-GI-LABEL: v2i64:
33-
; CHECK-GI: // %bb.0:
34-
; CHECK-GI-NEXT: movi v1.4s, #1
35-
; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31
36-
; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff
37-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
38-
; CHECK-GI-NEXT: mov d3, v2.d[1]
39-
; CHECK-GI-NEXT: fmov x9, d2
40-
; CHECK-GI-NEXT: mov d1, v0.d[1]
41-
; CHECK-GI-NEXT: fmov x8, d0
42-
; CHECK-GI-NEXT: fmov x10, d3
43-
; CHECK-GI-NEXT: mul x8, x8, x9
44-
; CHECK-GI-NEXT: fmov x9, d1
45-
; CHECK-GI-NEXT: mul x9, x9, x10
46-
; CHECK-GI-NEXT: fmov d0, x8
47-
; CHECK-GI-NEXT: mov v0.d[1], x9
48-
; CHECK-GI-NEXT: ret
27+
; CHECK-LABEL: v2i64:
28+
; CHECK: // %bb.0:
29+
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
30+
; CHECK-NEXT: ret
4931
%b = lshr <2 x i64> %a, <i64 31, i64 31>
5032
%c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
5133
%d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
5234
ret <2 x i64> %d
5335
}
5436

5537
define <2 x i32> @v2i32(<2 x i32> %a) {
56-
; CHECK-SD-LABEL: v2i32:
57-
; CHECK-SD: // %bb.0:
58-
; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0
59-
; CHECK-SD-NEXT: ret
60-
;
61-
; CHECK-GI-LABEL: v2i32:
62-
; CHECK-GI: // %bb.0:
63-
; CHECK-GI-NEXT: movi v1.4h, #1
64-
; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15
65-
; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff
66-
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
67-
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
68-
; CHECK-GI-NEXT: ret
38+
; CHECK-LABEL: v2i32:
39+
; CHECK: // %bb.0:
40+
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
41+
; CHECK-NEXT: ret
6942
%b = lshr <2 x i32> %a, <i32 15, i32 15>
7043
%c = and <2 x i32> %b, <i32 65537, i32 65537>
7144
%d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
7245
ret <2 x i32> %d
7346
}
7447

7548
define <4 x i32> @v4i32(<4 x i32> %a) {
76-
; CHECK-SD-LABEL: v4i32:
77-
; CHECK-SD: // %bb.0:
78-
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
79-
; CHECK-SD-NEXT: ret
80-
;
81-
; CHECK-GI-LABEL: v4i32:
82-
; CHECK-GI: // %bb.0:
83-
; CHECK-GI-NEXT: movi v1.8h, #1
84-
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
85-
; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
86-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
87-
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s
88-
; CHECK-GI-NEXT: ret
49+
; CHECK-LABEL: v4i32:
50+
; CHECK: // %bb.0:
51+
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
52+
; CHECK-NEXT: ret
8953
%b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
9054
%c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
9155
%d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
9256
ret <4 x i32> %d
9357
}
9458

9559
define <8 x i32> @v8i32(<8 x i32> %a) {
96-
; CHECK-SD-LABEL: v8i32:
97-
; CHECK-SD: // %bb.0:
98-
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
99-
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
100-
; CHECK-SD-NEXT: ret
101-
;
102-
; CHECK-GI-LABEL: v8i32:
103-
; CHECK-GI: // %bb.0:
104-
; CHECK-GI-NEXT: movi v2.8h, #1
105-
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
106-
; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15
107-
; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
108-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
109-
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
110-
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s
111-
; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s
112-
; CHECK-GI-NEXT: ret
60+
; CHECK-LABEL: v8i32:
61+
; CHECK: // %bb.0:
62+
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
63+
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
64+
; CHECK-NEXT: ret
11365
%b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
11466
%c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
11567
%d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
11668
ret <8 x i32> %d
11769
}
11870

11971
define <4 x i16> @v4i16(<4 x i16> %a) {
120-
; CHECK-SD-LABEL: v4i16:
121-
; CHECK-SD: // %bb.0:
122-
; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0
123-
; CHECK-SD-NEXT: ret
124-
;
125-
; CHECK-GI-LABEL: v4i16:
126-
; CHECK-GI: // %bb.0:
127-
; CHECK-GI-NEXT: movi v1.8b, #1
128-
; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7
129-
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
130-
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
131-
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h
132-
; CHECK-GI-NEXT: ret
72+
; CHECK-LABEL: v4i16:
73+
; CHECK: // %bb.0:
74+
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
75+
; CHECK-NEXT: ret
13376
%b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
13477
%c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
13578
%d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
13679
ret <4 x i16> %d
13780
}
13881

13982
define <8 x i16> @v8i16(<8 x i16> %a) {
140-
; CHECK-SD-LABEL: v8i16:
141-
; CHECK-SD: // %bb.0:
142-
; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0
143-
; CHECK-SD-NEXT: ret
144-
;
145-
; CHECK-GI-LABEL: v8i16:
146-
; CHECK-GI: // %bb.0:
147-
; CHECK-GI-NEXT: movi v1.16b, #1
148-
; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7
149-
; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
150-
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
151-
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
152-
; CHECK-GI-NEXT: ret
83+
; CHECK-LABEL: v8i16:
84+
; CHECK: // %bb.0:
85+
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
86+
; CHECK-NEXT: ret
15387
%b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
15488
%c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
15589
%d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>

0 commit comments

Comments
 (0)