
[LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. #134116


Merged: 2 commits, Apr 8, 2025
123 changes: 82 additions & 41 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1112,6 +1112,19 @@ struct SVEIntrinsicInfo {
return *this;
}

bool hasMatchingIROpode() const { return IROpcode != 0; }
Collaborator Author: There's an existing comment above that hopefully gives enough information, but please shout if you feel something more bespoke is required.

unsigned getMatchingIROpode() const {
assert(hasMatchingIROpode() && "Propery not set!");
return IROpcode;
}

SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
assert(!hasMatchingIROpode() && "Cannot set property twice!");
Collaborator: nit: I'm not sure I see the value of this assert, as I could imagine a use-case for someone wanting to override a previously set opcode?

Collaborator Author: I consider that a bug. The intent is for an intrinsic's SVEIntrinsicInfo to be constant, and I hope one day to make this static rather than having to recompute it as is done today.

IROpcode = Opcode;
return *this;
}

//
// Properties relating to the result of inactive lanes.
//
@@ -1187,6 +1200,7 @@ struct SVEIntrinsicInfo {
unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();

Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
unsigned IROpcode = 0;

enum PredicationStyle {
Uninitialized,
@@ -1270,7 +1284,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_fmls:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u);
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
.setMatchingIROpcode(Instruction::FMul);
case Intrinsic::aarch64_sve_fmulx:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
@@ -1286,7 +1301,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_mls:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u);
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
.setMatchingIROpcode(Instruction::Mul);
case Intrinsic::aarch64_sve_sabd:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
@@ -1324,6 +1340,13 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_uqsub:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);

case Intrinsic::aarch64_sve_fmul_u:
return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
Instruction::FMul);
case Intrinsic::aarch64_sve_mul_u:
return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
Instruction::Mul);

case Intrinsic::aarch64_sve_addqv:
case Intrinsic::aarch64_sve_and_z:
case Intrinsic::aarch64_sve_bic_z:
@@ -2206,45 +2229,63 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
return std::nullopt;
}

static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
IntrinsicInst &II) {
auto *OpPredicate = II.getOperand(0);
auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);
// Simplify `V` by only considering the operations that affect active lanes.
// This function should only return existing Values or newly created Constants.
static Value *stripInactiveLanes(Value *V, const Value *Pg) {
auto *Dup = dyn_cast<IntrinsicInst>(V);
if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
return ConstantVector::getSplat(
cast<VectorType>(V->getType())->getElementCount(),
cast<Constant>(Dup->getOperand(2)));

return V;
}

// Return true if a given instruction is a unit splat value, false otherwise.
auto IsUnitSplat = [](auto *I) {
auto *SplatValue = getSplatValue(I);
if (!SplatValue)
return false;
return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};
static std::optional<Instruction *>
instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II,
const SVEIntrinsicInfo &IInfo) {
const unsigned Opc = IInfo.getMatchingIROpode();
if (!Instruction::isBinaryOp(Opc))
return std::nullopt;

// Return true if a given instruction is an aarch64_sve_dup intrinsic call
// with a unit splat value, false otherwise.
auto IsUnitDup = [](auto *I) {
auto *IntrI = dyn_cast<IntrinsicInst>(I);
if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
return false;
Value *Pg = II.getOperand(0);
Value *Op1 = II.getOperand(1);
Value *Op2 = II.getOperand(2);
const DataLayout &DL = II.getDataLayout();

auto *SplatValue = IntrI->getOperand(2);
return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};
// Canonicalise constants to the RHS.
if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
isa<Constant>(Op1) && !isa<Constant>(Op2)) {
IC.replaceOperand(II, 1, Op2);
IC.replaceOperand(II, 2, Op1);
return &II;
}
Comment on lines +2257 to +2263
Collaborator: If this code were removed, would this case normally be handled by simplifyBinOp?

Collaborator Author: Yes. This change is not related to simplifyBinOp but exists to mirror the usual canonicalisation performed for commutative binops.

Collaborator: If I look at the code path it takes, then removing this code would always send it down the simplifyBinOp path, and if that would do the canonicalisation, then SimpleII should not be nullptr and it should return that on line 2277. That said, I tried commenting out the code and the canonicalisation doesn't happen.

Collaborator Author (paulwalker-arm, Apr 3, 2025): Sorry, I think I misunderstood the original question. I figured you meant "without this code, does simplifyBinOp just work for both mul X, 1 and mul 1, X", to which the answer is yes.

Canonicalisation is not a simplification because simplification can only return an existing operand (or child operand) or generate a new constant. Canonicalising the operands would require simplifyBinOp to create a new instruction with the operands reversed, which it is not allowed to do.

Collaborator:

> Canonicalisation is not a simplification because simplification can only return an existing operand (or child operand) or generate a new constant.

Thanks, that clarifies it; my question was indeed whether this function did the canonicalisation. If the answer is no, this code makes sense.
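To make the distinction concrete, a hypothetical LLVM IR sketch (not one of the patch's tests; the values %pg, %a, %r0, %r1 are made up): simplifyBinOp is free to fold mul X, 1 or mul 1, X to the existing value X, but when nothing folds it cannot reorder the operands of the surviving call, which is exactly what the explicit swap above provides.

; Hypothetical input: multiplying by a splat of 2, so nothing simplifies
; and simplifyBinOp returns null.
%r0 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat (i64 2), <vscale x 2 x i64> %a)
; The canonicalisation above still swaps operands 1 and 2 so the constant
; lands on the RHS, the usual form for commutative binops:
%r1 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 2))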


if (IsUnitSplat(OpMultiplier)) {
// [f]mul pg %n, (dupx 1) => %n
OpMultiplicand->takeName(&II);
return IC.replaceInstUsesWith(II, OpMultiplicand);
} else if (IsUnitDup(OpMultiplier)) {
// [f]mul pg %n, (dup pg 1) => %n
auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
auto *DupPg = DupInst->getOperand(1);
// TODO: this is naive. The optimization is still valid if DupPg
// 'encompasses' OpPredicate, not only if they're the same predicate.
if (OpPredicate == DupPg) {
OpMultiplicand->takeName(&II);
return IC.replaceInstUsesWith(II, OpMultiplicand);
}
// Only active lanes matter when simplifying the operation.
Op1 = stripInactiveLanes(Op1, Pg);
Op2 = stripInactiveLanes(Op2, Pg);
Comment on lines +2265 to +2267
Collaborator: Is there value in stripping the inactive lanes before the "Canonicalise constants to the RHS" above, so that the case above may hit more often? (e.g. a case of svmul_x(svdup(..), svdup()))

Collaborator Author: The canonicalisation does not relate to simplifyBinOp so I don't think so.

The benefit would be to simplify the writing of future combines to remove the need to check both operands. Is this something you want me to implement now or defer until such combines exist?

Collaborator: If there's no value at this point, I don't think there's a need.
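For reference, a small hypothetical IR sketch of what the stripping enables (it mirrors the predicated-dup tests further down; %pg, %a, %d, %r are made-up names): a sve.dup whose governing predicate matches the multiply's is treated as a plain constant splat, because only the active lanes feed the operation.

; %d is only defined in the lanes where %pg is true, but those are exactly
; the lanes the predicated mul reads, so stripInactiveLanes substitutes
; splat (i16 1) for %d before calling simplifyBinOp.
%d = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, i16 1)
%r = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %d)
; simplifyBinOp(Mul, %a, splat(1), DL) then returns the existing value %a,
; and the whole call folds to %a.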


Value *SimpleII;
if (auto FII = dyn_cast<FPMathOperator>(&II))
SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
else
SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);

if (SimpleII) {
if (IInfo.inactiveLanesAreNotDefined())
return IC.replaceInstUsesWith(II, SimpleII);

Value *Inactive =
II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());

// The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
if (SimpleII == Inactive)
return IC.replaceInstUsesWith(II, SimpleII);

// Inactive lanes must be preserved.
SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
return IC.replaceInstUsesWith(II, SimpleII);
}

return instCombineSVEVectorBinOp(IC, II);
@@ -2651,9 +2692,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fmul:
return instCombineSVEVectorMul(IC, II);
return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_fmul_u:
return instCombineSVEVectorMul(IC, II);
return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
Expand All @@ -2665,9 +2706,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_mul:
return instCombineSVEVectorMul(IC, II);
return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_mul_u:
return instCombineSVEVectorMul(IC, II);
return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
; Idempotent fmuls -- should compile to just a ret.
define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_f16(
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
;
%1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
%2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x

define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @idempotent_fmul_f32(
; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
;
%1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
%2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale

define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_f64(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
%2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale
define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[A]], <vscale x 2 x double> splat (double 1.000000e+00)
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
@@ -48,8 +48,8 @@ define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x

define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
;
%1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, half 1.0)
%2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -36,9 +36,8 @@ define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vsca

define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
; Different argument order to the above tests.
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
; Idempotent muls -- should compile to just a ret.
define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_i16(
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret <vscale x 8 x i16> [[A]]
;
%1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
%2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8

define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_i32(
; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 4 x i32> [[A]]
;
%1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
%2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4

define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_i64(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 2 x i64> [[A]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
%2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2
define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 1)
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
@@ -48,8 +48,8 @@ define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x

define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 8 x i16> [[A]]
;
%1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, i16 1)
%2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -36,9 +36,8 @@ define <vscale x 2 x i64> @idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x

define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
; Different argument order to the above tests.