Skip to content

[AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15), 0x10001), 0xffff) to CMLTz #92915

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged 1 commit on May 29, 2024.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule <
(apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
>;

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
def combine_mul_cmlt : GICombineRule<
(defs root:$root, register_matchinfo:$matchinfo),
(match (wip_match_opcode G_MUL):$root,
[{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
>;

// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
Expand Down Expand Up @@ -296,5 +304,6 @@ def AArch64PostLegalizerCombiner
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating]> {
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt]> {
}
55 changes: 55 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
Register &SrcReg) {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
DstTy != LLT::fixed_vector(8, 16))
return false;

auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
if (AndMI->getOpcode() != TargetOpcode::G_AND)
return false;
auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
return false;

// Check the constant splat values
auto V1 = isConstantOrConstantSplatVector(
*MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
auto V2 = isConstantOrConstantSplatVector(
*MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
auto V3 = isConstantOrConstantSplatVector(
*MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
if (!V1.has_value() || !V2.has_value() || !V3.has_value())
return false;
unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
V3 != (HalfSize - 1))
return false;

SrcReg = LShrMI->getOperand(1).getReg();

return true;
}

void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, Register &SrcReg) {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
LLT HalfTy =
DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
.changeElementSize(DstTy.getScalarSizeInBits() / 2);

Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
Register CastReg =
B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
Register CMLTReg =
B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
.getReg(0);

B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
MI.eraseFromParent();
}

class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
// TODO: Make CombinerHelper methods const.
Expand Down
114 changes: 25 additions & 89 deletions llvm/test/CodeGen/AArch64/mulcmle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,130 +24,66 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
}

define <2 x i64> @v2i64(<2 x i64> %a) {
; CHECK-SD-LABEL: v2i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.4s, #1
; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31
; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: fmov x11, d2
; CHECK-GI-NEXT: mov x9, v2.d[1]
; CHECK-GI-NEXT: fmov x10, d0
; CHECK-GI-NEXT: mov x8, v0.d[1]
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov d0, x10
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: ret
%b = lshr <2 x i64> %a, <i64 31, i64 31>
%c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
%d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
ret <2 x i64> %d
}

define <2 x i32> @v2i32(<2 x i32> %a) {
; CHECK-SD-LABEL: v2i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.4h, #1
; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15
; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: ret
%b = lshr <2 x i32> %a, <i32 15, i32 15>
%c = and <2 x i32> %b, <i32 65537, i32 65537>
%d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
ret <2 x i32> %d
}

define <4 x i32> @v4i32(<4 x i32> %a) {
; CHECK-SD-LABEL: v4i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.8h, #1
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-NEXT: ret
%b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
%c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
%d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
ret <4 x i32> %d
}

define <8 x i32> @v8i32(<8 x i32> %a) {
; CHECK-SD-LABEL: v8i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v8i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v2.8h, #1
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15
; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s
; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-NEXT: ret
%b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
%c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
%d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
ret <8 x i32> %d
}

define <4 x i16> @v4i16(<4 x i16> %a) {
; CHECK-SD-LABEL: v4i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v4i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.8b, #1
; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-NEXT: ret
%b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
%c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
%d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
ret <4 x i16> %d
}

define <8 x i16> @v8i16(<8 x i16> %a) {
; CHECK-SD-LABEL: v8i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.16b, #1
; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7
; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: ret
%b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
%d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
Expand Down
Loading