[LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. #134116
Conversation
After #126928 it's now possible to rewrite the existing combines, which mostly only handle cases where an operand is an identity value, to use existing simplify code to unlock general constant folding.
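A hand-written sketch (not one of the patch's tests; the function name and constants are made up for illustration) of the kind of fold this unlocks: a merging multiply of two constant splats can now fold its active lanes outright, with a select preserving the inactive lanes taken from the first operand.

define <vscale x 4 x i32> @fold_constant_mul(<vscale x 4 x i1> %pg) {
  ; Both multiplicands are constant splats, so the active lanes fold to 3 * 7.
  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 3), <vscale x 4 x i32> splat (i32 7))
  ret <vscale x 4 x i32> %r
}
; Expected shape after instcombine (roughly): active lanes become splat (i32 21),
; inactive lanes keep the first operand's value, i.e.
;   %r = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 21), <vscale x 4 x i32> splat (i32 3)

declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)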
@llvm/pr-subscribers-llvm-transforms
Author: Paul Walker (paulwalker-arm)
Changes
After #126928 it's now possible to rewrite the existing combines, which mostly only handle cases where an operand is an identity value, to use existing simplify code to unlock general constant folding.
NOTE: There is nothing here specifically limited to multiply but because there are existing tests I figured I'd limit myself to those cases and then enable more operations incrementally. For similar reasons I have retained the current name of instCombineSVEVectorMul because instCombineSVEVectorBinOp is already in use and I'd like to refactor things to free that name up.
NOTE: Given there's no longer any dedicated code relating to the original work I plan to follow up with a patch to remove the sve-intrinsic-*-idempotency.ll tests.
Patch is 27.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134116.diff
6 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 823d77251a796..e43e2beb564c7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1112,6 +1112,19 @@ struct SVEIntrinsicInfo {
return *this;
}
+ bool hasMatchingIROpode() const { return IROpcode != 0; }
+
+ unsigned getMatchingIROpode() const {
+ assert(hasMatchingIROpode() && "Propery not set!");
+ return IROpcode;
+ }
+
+ SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
+ assert(!hasMatchingIROpode() && "Cannot set property twice!");
+ IROpcode = Opcode;
+ return *this;
+ }
+
//
// Properties relating to the result of inactive lanes.
//
@@ -1187,6 +1200,7 @@ struct SVEIntrinsicInfo {
unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
+ unsigned IROpcode = 0;
enum PredicationStyle {
Uninitialized,
@@ -1270,7 +1284,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_fmls:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
- return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u);
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
+ .setMatchingIROpcode(Instruction::FMul);
case Intrinsic::aarch64_sve_fmulx:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
@@ -1286,7 +1301,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_mls:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
- return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u);
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
+ .setMatchingIROpcode(Instruction::Mul);
case Intrinsic::aarch64_sve_sabd:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
@@ -1324,6 +1340,13 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_uqsub:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
+ case Intrinsic::aarch64_sve_fmul_u:
+ return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
+ Instruction::FMul);
+ case Intrinsic::aarch64_sve_mul_u:
+ return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
+ Instruction::Mul);
+
case Intrinsic::aarch64_sve_addqv:
case Intrinsic::aarch64_sve_and_z:
case Intrinsic::aarch64_sve_bic_z:
@@ -2206,45 +2229,63 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
return std::nullopt;
}
-static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
- IntrinsicInst &II) {
- auto *OpPredicate = II.getOperand(0);
- auto *OpMultiplicand = II.getOperand(1);
- auto *OpMultiplier = II.getOperand(2);
+// Simplify `V` by only considering the operations that affect active lanes.
+// This function should only return existing Values or newly created Constants.
+static Value *stripInactiveLanes(Value *V, const Value *Pg) {
+ auto *Dup = dyn_cast<IntrinsicInst>(V);
+ if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
+ Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
+ return ConstantVector::getSplat(
+ cast<VectorType>(V->getType())->getElementCount(),
+ cast<Constant>(Dup->getOperand(2)));
+
+ return V;
+}
- // Return true if a given instruction is a unit splat value, false otherwise.
- auto IsUnitSplat = [](auto *I) {
- auto *SplatValue = getSplatValue(I);
- if (!SplatValue)
- return false;
- return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
- };
+static std::optional<Instruction *>
+instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II,
+ const SVEIntrinsicInfo &IInfo) {
+ const unsigned Opc = IInfo.getMatchingIROpode();
+ if (!Instruction::isBinaryOp(Opc))
+ return std::nullopt;
- // Return true if a given instruction is an aarch64_sve_dup intrinsic call
- // with a unit splat value, false otherwise.
- auto IsUnitDup = [](auto *I) {
- auto *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
- return false;
+ Value *Pg = II.getOperand(0);
+ Value *Op1 = II.getOperand(1);
+ Value *Op2 = II.getOperand(2);
+ const DataLayout &DL = II.getDataLayout();
- auto *SplatValue = IntrI->getOperand(2);
- return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
- };
+ // Canonicalise constants to the RHS.
+ if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
+ isa<Constant>(Op1) && !isa<Constant>(Op2)) {
+ IC.replaceOperand(II, 1, Op2);
+ IC.replaceOperand(II, 2, Op1);
+ return &II;
+ }
- if (IsUnitSplat(OpMultiplier)) {
- // [f]mul pg %n, (dupx 1) => %n
- OpMultiplicand->takeName(&II);
- return IC.replaceInstUsesWith(II, OpMultiplicand);
- } else if (IsUnitDup(OpMultiplier)) {
- // [f]mul pg %n, (dup pg 1) => %n
- auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
- auto *DupPg = DupInst->getOperand(1);
- // TODO: this is naive. The optimization is still valid if DupPg
- // 'encompasses' OpPredicate, not only if they're the same predicate.
- if (OpPredicate == DupPg) {
- OpMultiplicand->takeName(&II);
- return IC.replaceInstUsesWith(II, OpMultiplicand);
- }
+ // Only active lanes matter when simplifying the operation.
+ Op1 = stripInactiveLanes(Op1, Pg);
+ Op2 = stripInactiveLanes(Op2, Pg);
+
+ Value *SimpleII;
+ if (auto FII = dyn_cast<FPMathOperator>(&II))
+ SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
+ else
+ SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
+
+ if (SimpleII) {
+ if (IInfo.inactiveLanesAreNotDefined())
+ return IC.replaceInstUsesWith(II, SimpleII);
+
+ Value *Inactive =
+ II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
+
+ // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
+ if (SimpleII == Inactive)
+ return IC.replaceInstUsesWith(II, SimpleII);
+
+ // Inactive lanes must be preserved.
+ SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
+ return IC.replaceInstUsesWith(II, SimpleII);
}
return instCombineSVEVectorBinOp(IC, II);
@@ -2651,9 +2692,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fmul:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_fmul_u:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
@@ -2665,9 +2706,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_mul:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_mul_u:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
index f612e5a44ebba..3b37e2c1fddef 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
; Idempotent fmuls -- should compile to just a ret.
define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_f16(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
;
%1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
%2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x
define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @idempotent_fmul_f32(
-; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
;
%1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
%2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale
define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_f64(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
%2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale
define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[A]], <vscale x 2 x double> splat (double 1.000000e+00)
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
@@ -48,8 +48,8 @@ define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x
define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
;
%1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, half 1.0)
%2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
index bd3d7be0a1b80..38ed4272d826c 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -36,9 +36,8 @@ define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vsca
define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
-; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
; Different argument order to the above tests.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
index cbdcfc6b110b3..602db4eb1d429 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
; Idempotent muls -- should compile to just a ret.
define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_i16(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> [[A]]
;
%1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
%2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8
define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_i32(
-; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> [[A]]
;
%1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
%2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4
define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_i64(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x i64> [[A]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
%2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2
define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 1)
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
@@ -48,8 +48,8 @@ define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x
define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> [[A]]
;
%1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, i16 1)
%2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
index 8144e56b979f0..e899c787aa555 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
@@ -36,9 +36,8 @@ define <vscale x 2 x i64> @idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x
define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
-; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
; Different argument order to the above tests.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
new file mode 100644
index 0000000000000..7da55a199df28
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; The follow tests verify the mechanics of simplification. The operation is not
+; important beyond being commutative with a known identity value.
+
+define <vscale x 4 x i32> @commute_constant_to_rhs(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @commute_constant_to_rhs(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[R:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 303))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 303), <vscale x...
[truncated]
@llvm/pr-subscribers-backend-aarch64
Author: Paul Walker (paulwalker-arm)
(Same description and patch excerpt as the @llvm/pr-subscribers-llvm-transforms comment above.)
@@ -1112,6 +1112,19 @@ struct SVEIntrinsicInfo {
    return *this;
  }

  bool hasMatchingIROpode() const { return IROpcode != 0; }
There's an existing comment above that hopefully gives enough information but please shout if you feel something more bespoke is required.
  }

  SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
    assert(!hasMatchingIROpode() && "Cannot set property twice!");
nit: I'm not sure I see the value of this assert, as I could imagine a use-case for someone wanting to override a previously set opcode?
I consider that a bug. The intent is for an intrinsic's SVEIntrinsicInfo to be constant and I have a hope to one day make this static rather than having to recompute it as is done today.
  // Only active lanes matter when simplifying the operation.
  Op1 = stripInactiveLanes(Op1, Pg);
  Op2 = stripInactiveLanes(Op2, Pg);
Is there value in stripping the inactive lanes before the "Canonicalise constants to the RHS" above, so that the case above may hit more often? (e.g. a case of svmul_x(svdup(..), svdup()))
The canonicalisation does not relate to simplifyBinOp so I don't think so. The benefit would be to simplify the writing of future combines to remove the need to check both operands. Is this something you want me to implement now or defer until such combines exist?
If there's no value at this point, I don't think there's a need.
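For reference, this is the shape of input stripInactiveLanes exists to handle, mirroring the idempotent_mul_with_predicated_dup test in the diff (the function name here is made up): a dup governed by the same predicate as the multiply can be treated as a plain constant splat while simplifying, because only the active lanes matter.

define <vscale x 8 x i16> @predicated_dup_treated_as_splat(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
  ; The dup shares the multiply's predicate, so its inactive lanes are irrelevant;
  ; it simplifies as if it were splat (i16 1) and the whole multiply folds to %a.
  %one = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, i16 1)
  %r = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %one)
  ret <vscale x 8 x i16> %r
}

declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)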
  // Canonicalise constants to the RHS.
  if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
      isa<Constant>(Op1) && !isa<Constant>(Op2)) {
    IC.replaceOperand(II, 1, Op2);
    IC.replaceOperand(II, 2, Op1);
    return &II;
  }
If this code would be removed, would this case normally be handled by simplifyBinOp?
Yes. This change is not related to simplifyBinOp but exists to mirror the usual canonicalisation performed for commutative binops.
If I look at the code-path it takes, then removing this code would always send it down the simplifyBinOp path, and if that would do the canonicalisation, then SimpleII should not be nullptr and it should return that on line 2277. That said, I tried commenting out the code and the canonicalisation doesn't happen.
Sorry, I think I misunderstood the original question? I figured you meant "without this code does simplifyBinOp just work for both mul X, 1 and mul 1, X", of which the answer is yes.
Canonicalisation is not a simplification because simplification can only return an existing operand (or child operand) or generate a new constant. Canonicalising the operands would require simplifyBinOp to create a new instruction with the operands reversed, which it is not allowed to do.
Canonicalisation is not a simplification because simplification can only return an existing operand (or child operand) or generate a new constant.
Thanks, that clarifies it, my question was indeed if this function did the canonicalisation. If the answer is no, this code makes sense.
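A sketch of the case behind this exchange, modelled on the new commute_constant_to_rhs test (the function name is made up and the behaviour is inferred from the patch): simplifyBinOp has nothing to fold for mul(303, %a) and is not allowed to build a reordered instruction, so the operand swap has to come from the combine's own replaceOperand calls.

define <vscale x 4 x i32> @constant_on_lhs(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
  ; No simplification applies here; the combine itself swaps the operands,
  ; expected to give mul.u(%pg, %a, splat (i32 303)) after instcombine.
  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> %a)
  ret <vscale x 4 x i32> %r
}

declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)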
After #126928 it's now possible to rewrite the existing combines, which mostly only handle cases where an operand is an identity value, to use existing simplify code to unlock general constant folding.

NOTE: There is nothing here specifically limited to multiply but because there are existing tests I figured I'd limit myself to those cases and then enable more operations incrementally. For similar reasons I have retained the current name of instCombineSVEVectorMul because instCombineSVEVectorBinOp is already in use and I'd like to refactor things to free that name up.

NOTE: Given there's no longer any dedicated code relating to the original work I plan to follow up with a patch to remove the sve-intrinsic-*-idempotency.ll tests.