
Commit 41675fa
[VPlan] Simplify vp.merge true, (or x, y), x -> vp.merge y, true, x (#135017)
With EVL tail folding, an AnyOf reduction will emit an i1 vp.merge like

  vp.merge true, (or phi, cond), phi, evl

We can remove the or and optimise this to

  vp.merge cond, true, phi, evl

which makes it slightly easier to pattern match in #134898.

This also adds a pattern matcher for calls to help match this.

Blended AnyOf reductions will use an and instead of an or, which we may also be able to simplify in a later patch.
1 parent: 45f2716
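
Why the rewrite is sound: per lane, vp.merge takes its second operand where the mask bit is set and the lane is below evl, and its third operand otherwise. The snippet below is a minimal standalone sketch (assumed per-lane semantics folded into a single Active flag; it is not LLVM code) that brute-forces all i1 combinations to check that vp.merge true, (or phi, cond), phi and vp.merge cond, true, phi agree in every lane.

// Minimal scalar model of one vp.merge lane (assumption: lanes at or above
// EVL take the "on false" operand; active lanes honour the mask bit).
#include <cassert>

static bool mergeLane(bool Mask, bool OnTrue, bool OnFalse, bool Active) {
  return (Active && Mask) ? OnTrue : OnFalse;
}

int main() {
  // Phi = the AnyOf reduction phi, Cond = the loop's compare result.
  for (int Phi = 0; Phi < 2; ++Phi)
    for (int Cond = 0; Cond < 2; ++Cond)
      for (int Active = 0; Active < 2; ++Active) {
        // Before: vp.merge true, (or Phi, Cond), Phi, evl
        bool Before = mergeLane(/*Mask=*/true, /*OnTrue=*/Phi != 0 || Cond != 0,
                                /*OnFalse=*/Phi != 0, Active != 0);
        // After:  vp.merge Cond, true, Phi, evl
        bool After = mergeLane(/*Mask=*/Cond != 0, /*OnTrue=*/true,
                               /*OnFalse=*/Phi != 0, Active != 0);
        assert(Before == After);
      }
  return 0;
}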

4 files changed (+134, -8 lines)

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 118 additions & 0 deletions
@@ -166,11 +166,29 @@ template <typename LTy, typename RTy> struct match_combine_or {
   }
 };

+template <typename LTy, typename RTy> struct match_combine_and {
+  LTy L;
+  RTy R;
+
+  match_combine_and(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
+
+  template <typename ITy> bool match(ITy *V) const {
+    return L.match(V) && R.match(V);
+  }
+};
+
+/// Combine two pattern matchers matching L || R
 template <typename LTy, typename RTy>
 inline match_combine_or<LTy, RTy> m_CombineOr(const LTy &L, const RTy &R) {
   return match_combine_or<LTy, RTy>(L, R);
 }

+/// Combine two pattern matchers matching L && R
+template <typename LTy, typename RTy>
+inline match_combine_and<LTy, RTy> m_CombineAnd(const LTy &L, const RTy &R) {
+  return match_combine_and<LTy, RTy>(L, R);
+}
+
 /// Match a VPValue, capturing it if we match.
 inline bind_ty<VPValue> m_VPValue(VPValue *&V) { return V; }

@@ -469,6 +487,106 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
   return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});
 }

+/// Match a call argument at a given argument index.
+template <typename Opnd_t> struct Argument_match {
+  /// Call argument index to match.
+  unsigned OpI;
+  Opnd_t Val;
+
+  Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) {}
+
+  template <typename OpTy> bool match(OpTy *V) const {
+    if (const auto *R = dyn_cast<VPWidenIntrinsicRecipe>(V))
+      return Val.match(R->getOperand(OpI));
+    if (const auto *R = dyn_cast<VPWidenCallRecipe>(V))
+      return Val.match(R->getOperand(OpI));
+    if (const auto *R = dyn_cast<VPReplicateRecipe>(V))
+      if (isa<CallInst>(R->getUnderlyingInstr()))
+        return Val.match(R->getOperand(OpI + 1));
+    return false;
+  }
+};
+
+/// Match a call argument.
+template <unsigned OpI, typename Opnd_t>
+inline Argument_match<Opnd_t> m_Argument(const Opnd_t &Op) {
+  return Argument_match<Opnd_t>(OpI, Op);
+}
+
+/// Intrinsic matchers.
+struct IntrinsicID_match {
+  unsigned ID;
+
+  IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) {}
+
+  template <typename OpTy> bool match(OpTy *V) const {
+    if (const auto *R = dyn_cast<VPWidenIntrinsicRecipe>(V))
+      return R->getVectorIntrinsicID() == ID;
+    if (const auto *R = dyn_cast<VPWidenCallRecipe>(V))
+      return R->getCalledScalarFunction()->getIntrinsicID() == ID;
+    if (const auto *R = dyn_cast<VPReplicateRecipe>(V))
+      if (const auto *CI = dyn_cast<CallInst>(R->getUnderlyingInstr()))
+        if (const auto *F = CI->getCalledFunction())
+          return F->getIntrinsicID() == ID;
+    return false;
+  }
+};
+
+/// Intrinsic matches are combinations of ID matchers, and argument
+/// matchers. Higher arity matcher are defined recursively in terms of and-ing
+/// them with lower arity matchers. Here's some convenient typedefs for up to
+/// several arguments, and more can be added as needed
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+          typename T3 = void>
+struct m_Intrinsic_Ty;
+template <typename T0> struct m_Intrinsic_Ty<T0> {
+  using Ty = match_combine_and<IntrinsicID_match, Argument_match<T0>>;
+};
+template <typename T0, typename T1> struct m_Intrinsic_Ty<T0, T1> {
+  using Ty =
+      match_combine_and<typename m_Intrinsic_Ty<T0>::Ty, Argument_match<T1>>;
+};
+template <typename T0, typename T1, typename T2>
+struct m_Intrinsic_Ty<T0, T1, T2> {
+  using Ty = match_combine_and<typename m_Intrinsic_Ty<T0, T1>::Ty,
+                               Argument_match<T2>>;
+};
+template <typename T0, typename T1, typename T2, typename T3>
+struct m_Intrinsic_Ty {
+  using Ty = match_combine_and<typename m_Intrinsic_Ty<T0, T1, T2>::Ty,
+                               Argument_match<T3>>;
+};
+
+/// Match intrinsic calls like this:
+/// m_Intrinsic<Intrinsic::fabs>(m_VPValue(X), ...)
+template <Intrinsic::ID IntrID> inline IntrinsicID_match m_Intrinsic() {
+  return IntrinsicID_match(IntrID);
+}
+
+template <Intrinsic::ID IntrID, typename T0>
+inline typename m_Intrinsic_Ty<T0>::Ty m_Intrinsic(const T0 &Op0) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
+}
+
+template <Intrinsic::ID IntrID, typename T0, typename T1>
+inline typename m_Intrinsic_Ty<T0, T1>::Ty m_Intrinsic(const T0 &Op0,
+                                                       const T1 &Op1) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(Op0), m_Argument<1>(Op1));
+}
+
+template <Intrinsic::ID IntrID, typename T0, typename T1, typename T2>
+inline typename m_Intrinsic_Ty<T0, T1, T2>::Ty
+m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1), m_Argument<2>(Op2));
+}
+
+template <Intrinsic::ID IntrID, typename T0, typename T1, typename T2,
+          typename T3>
+inline typename m_Intrinsic_Ty<T0, T1, T2, T3>::Ty
+m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2, const T3 &Op3) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1, Op2), m_Argument<3>(Op3));
+}
+
 } // namespace VPlanPatternMatch
 } // namespace llvm
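
Aside: the m_Intrinsic helpers above mirror the IR-level PatternMatch.h approach, where each extra argument matcher is and-ed onto the previous matcher via m_CombineAnd. The snippet below is a self-contained toy model (its Value, IdMatch, ArgMatch and CombineAnd types are invented for illustration and are not the VPlan recipe classes) showing how that nesting composes and evaluates.

#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for a recipe carrying an intrinsic ID and an operand list.
struct Value {
  unsigned IntrinsicID;
  std::vector<int> Args;
};

// Matches when the value's intrinsic ID equals ID (cf. IntrinsicID_match).
struct IdMatch {
  unsigned ID;
  bool match(const Value &V) const { return V.IntrinsicID == ID; }
};

// Matches when argument OpI equals Expected (cf. Argument_match).
struct ArgMatch {
  std::size_t OpI;
  int Expected;
  bool match(const Value &V) const {
    return OpI < V.Args.size() && V.Args[OpI] == Expected;
  }
};

// Both sub-matchers must match (cf. match_combine_and / m_CombineAnd).
template <typename L, typename R> struct CombineAnd {
  L Lhs;
  R Rhs;
  bool match(const Value &V) const { return Lhs.match(V) && Rhs.match(V); }
};

int main() {
  // Roughly the shape m_Intrinsic<ID>(arg0, arg1) expands to:
  // CombineAnd<CombineAnd<IdMatch, ArgMatch>, ArgMatch>.
  CombineAnd<CombineAnd<IdMatch, ArgMatch>, ArgMatch> M{{{42}, {0, 7}}, {1, 9}};

  assert(M.match(Value{42, {7, 9}}));   // ID and both arguments line up.
  assert(!M.match(Value{42, {7, 8}}));  // Second argument differs.
  assert(!M.match(Value{13, {7, 9}}));  // Wrong intrinsic ID.
  return 0;
}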

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 12 additions & 0 deletions
@@ -1026,6 +1026,18 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     if (TypeInfo.inferScalarType(X) != WideStepTy)
       X = VPBuilder(&R).createWidenCast(Instruction::Trunc, X, WideStepTy);
     R.getVPSingleValue()->replaceAllUsesWith(X);
+    return;
+  }
+
+  // For i1 vp.merges produced by AnyOf reductions:
+  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
+  if (match(&R, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
+                                                 m_VPValue(X), m_VPValue())) &&
+      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
+      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
+    R.setOperand(1, R.getOperand(0));
+    R.setOperand(0, Y);
+    return;
   }
 }
10311043

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll

Lines changed: 2 additions & 4 deletions
@@ -1899,8 +1899,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-NEXT:    [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
-; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -2024,8 +2023,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; IF-EVL-NEXT:    [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
-; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll

Lines changed: 2 additions & 4 deletions
@@ -1953,8 +1953,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
-; IF-EVL-NEXT:    [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
-; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -2078,8 +2077,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
-; IF-EVL-NEXT:    [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
-; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
+; IF-EVL-NEXT:    [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
