Skip to content

Commit f648e0a

Browse files
committed
The initial implementation is modeled after VPWidenLoadEVLRecipe.
1 parent fc3a21b commit f648e0a

File tree

5 files changed

+162
-27
lines changed

5 files changed

+162
-27
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
844844
case VPRecipeBase::VPEVLBasedIVPHISC:
845845
case VPRecipeBase::VPExpandSCEVSC:
846846
case VPRecipeBase::VPInstructionSC:
847+
case VPRecipeBase::VPReductionEVLSC:
847848
case VPRecipeBase::VPReductionSC:
848849
case VPRecipeBase::VPReplicateSC:
849850
case VPRecipeBase::VPScalarIVStepsSC:
@@ -2126,6 +2127,12 @@ class VPReductionRecipe : public VPSingleDefRecipe {
21262127
VPSlotTracker &SlotTracker) const override;
21272128
#endif
21282129

2130+
/// Return the recurrence descriptor for the in-loop reduction.
2131+
const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2132+
return RdxDesc;
2133+
}
2134+
/// Return true if the in-loop reduction is ordered.
2135+
bool isOrdered() const { return IsOrdered; };
21292136
/// The VPValue of the scalar Chain being accumulated.
21302137
VPValue *getChainOp() const { return getOperand(0); }
21312138
/// The VPValue of the vector value to be reduced.
@@ -2136,6 +2143,75 @@ class VPReductionRecipe : public VPSingleDefRecipe {
21362143
}
21372144
};
21382145

2146+
/// A recipe to represent in-loop reduction operations with vector-predication
2147+
/// intrinsics, performing a reduction on a vector operand with the explicit
2148+
/// vector length (EVL) into a scalar value, and adding the result to a chain.
2149+
/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
2150+
class VPReductionEVLRecipe : public VPSingleDefRecipe {
2151+
/// The recurrence descriptor for the reduction in question.
2152+
const RecurrenceDescriptor &RdxDesc;
2153+
bool IsOrdered;
2154+
2155+
VPReductionEVLRecipe(const RecurrenceDescriptor &R, Instruction *I,
2156+
VPValue *ChainOp, VPValue *VecOp, VPValue *EVL,
2157+
VPValue *CondOp, bool IsOrdered)
2158+
: VPSingleDefRecipe(VPDef::VPReductionEVLSC,
2159+
ArrayRef<VPValue *>({ChainOp, VecOp, EVL}), I),
2160+
RdxDesc(R), IsOrdered(IsOrdered) {
2161+
if (CondOp)
2162+
addOperand(CondOp);
2163+
}
2164+
2165+
public:
2166+
VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL)
2167+
: VPSingleDefRecipe(
2168+
VPDef::VPReductionEVLSC,
2169+
ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}),
2170+
R->getUnderlyingInstr()),
2171+
RdxDesc(R->getRecurrenceDescriptor()), IsOrdered(R->isOrdered()) {
2172+
VPValue *CondOp = R->getCondOp();
2173+
if (CondOp)
2174+
addOperand(CondOp);
2175+
};
2176+
2177+
~VPReductionEVLRecipe() override = default;
2178+
2179+
VPReductionEVLRecipe *clone() override {
2180+
return new VPReductionEVLRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
2181+
getVecOp(), getEVL(), getCondOp(),
2182+
IsOrdered);
2183+
}
2184+
2185+
VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
2186+
2187+
/// Generate the reduction in the loop
2188+
void execute(VPTransformState &State) override;
2189+
2190+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2191+
/// Print the recipe.
2192+
void print(raw_ostream &O, const Twine &Indent,
2193+
VPSlotTracker &SlotTracker) const override;
2194+
#endif
2195+
2196+
/// The VPValue of the scalar Chain being accumulated.
2197+
VPValue *getChainOp() const { return getOperand(0); }
2198+
/// The VPValue of the vector value to be reduced.
2199+
VPValue *getVecOp() const { return getOperand(1); }
2200+
/// The VPValue of the explicit vector length.
2201+
VPValue *getEVL() const { return getOperand(2); }
2202+
/// The VPValue of the condition for the block, or nullptr if there is none.
2203+
VPValue *getCondOp() const {
2204+
return getNumOperands() > 3 ? getOperand(3) : nullptr;
2205+
}
2206+
2207+
/// Returns true if the recipe only uses the first lane of operand \p Op.
2208+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
2209+
assert(is_contained(operands(), Op) &&
2210+
"Op must be an operand of the recipe");
2211+
return Op == getEVL();
2212+
}
2213+
};
2214+
21392215
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
21402216
/// copies of the original scalar type, one per lane, instead of producing a
21412217
/// single copy of widened type for all lanes. If the instruction is known to be

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,36 @@ void VPReductionRecipe::execute(VPTransformState &State) {
15901590
}
15911591
}
15921592

1593+
void VPReductionEVLRecipe::execute(VPTransformState &State) {
1594+
assert(!State.Instance && "Reduction being replicated.");
1595+
assert(State.UF == 1 &&
1596+
"Expected only UF == 1 when vectorizing with explicit vector length.");
1597+
1598+
auto &Builder = State.Builder;
1599+
// Propagate the fast-math flags recorded in the recurrence descriptor.
1600+
IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1601+
Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1602+
1603+
RecurKind Kind = RdxDesc.getRecurrenceKind();
1604+
Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
1605+
Value *VecOp = State.get(getVecOp(), 0);
1606+
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1607+
Value *Cond = getCondOp() ? State.get(getCondOp(), 0) : nullptr;
1608+
1609+
Value *NewRed;
1610+
if (IsOrdered) {
1611+
NewRed = createOrderedReduction(Builder, RdxDesc, VecOp, Prev, EVL, Cond);
1612+
} else {
1613+
NewRed = createSimpleTargetReduction(Builder, VecOp, Kind, EVL, Cond);
1614+
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1615+
NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1616+
else
1617+
NewRed = Builder.CreateBinOp(
1618+
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1619+
}
1620+
State.set(this, NewRed, 0, /*IsScalar*/ true);
1621+
}
1622+
15931623
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
15941624
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
15951625
VPSlotTracker &SlotTracker) const {
@@ -1611,6 +1641,29 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
16111641
O << " (with final reduction value stored in invariant address sank "
16121642
"outside of loop)";
16131643
}
1644+
1645+
void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1646+
VPSlotTracker &SlotTracker) const {
1647+
O << Indent << "REDUCE ";
1648+
printAsOperand(O, SlotTracker);
1649+
O << " = ";
1650+
getChainOp()->printAsOperand(O, SlotTracker);
1651+
O << " +";
1652+
if (isa<FPMathOperator>(getUnderlyingInstr()))
1653+
O << getUnderlyingInstr()->getFastMathFlags();
1654+
O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1655+
getVecOp()->printAsOperand(O, SlotTracker);
1656+
O << ", ";
1657+
getEVL()->printAsOperand(O, SlotTracker);
1658+
if (getCondOp()) {
1659+
O << ", ";
1660+
getCondOp()->printAsOperand(O, SlotTracker);
1661+
}
1662+
O << ")";
1663+
if (RdxDesc.IntermediateStore)
1664+
O << " (with final reduction value stored in invariant address sank "
1665+
"outside of loop)";
1666+
}
16141667
#endif
16151668

16161669
bool VPReplicateRecipe::shouldPack() const {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,23 +1337,29 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
13371337

13381338
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
13391339
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
1340-
auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
1341-
if (!MemR)
1342-
continue;
1343-
VPValue *OrigMask = MemR->getMask();
1344-
assert(OrigMask && "Unmasked widen memory recipe when folding tail");
1345-
VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
1346-
if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
1347-
auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
1348-
N->insertBefore(L);
1349-
L->replaceAllUsesWith(N);
1350-
L->eraseFromParent();
1351-
} else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
1352-
auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
1353-
N->insertBefore(S);
1354-
S->eraseFromParent();
1355-
} else {
1356-
llvm_unreachable("unsupported recipe");
1340+
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
1341+
if (!MemR)
1342+
continue;
1343+
VPValue *OrigMask = MemR->getMask();
1344+
assert(OrigMask && "Unmasked widen memory recipe when folding tail");
1345+
VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
1346+
if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
1347+
auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
1348+
N->insertBefore(L);
1349+
L->replaceAllUsesWith(N);
1350+
L->eraseFromParent();
1351+
} else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
1352+
auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
1353+
N->insertBefore(S);
1354+
S->eraseFromParent();
1355+
} else {
1356+
llvm_unreachable("unsupported recipe");
1357+
}
1358+
} else if (auto *RedR = dyn_cast<VPReductionRecipe>(U)) {
1359+
auto *N = new VPReductionEVLRecipe(RedR, VPEVL);
1360+
N->insertBefore(RedR);
1361+
RedR->replaceAllUsesWith(N);
1362+
RedR->eraseFromParent();
13571363
}
13581364
}
13591365
recursivelyDeleteDeadRecipes(HeaderMask);

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ class VPDef {
347347
VPExpandSCEVSC,
348348
VPInstructionSC,
349349
VPInterleaveSC,
350+
VPReductionEVLSC,
350351
VPReductionSC,
351352
VPReplicateSC,
352353
VPScalarCastSC,

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
209209
; IF-EVL-INLOOP: vector.body:
210210
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
211211
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
212-
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
212+
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
213213
; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
214214
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP7]], i32 8, i1 true)
215215
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[EVL_BASED_IV]], 0
@@ -223,31 +223,30 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
223223
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
224224
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP14]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP8]])
225225
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
226-
; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i32> [[TMP15]], <vscale x 8 x i32> zeroinitializer
227-
; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP16]])
228-
; IF-EVL-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]]
226+
; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP15]], <vscale x 8 x i1> [[TMP12]], i32 [[TMP8]])
227+
; IF-EVL-INLOOP-NEXT: [[TMP17]] = add i32 [[TMP16]], [[VEC_PHI]]
229228
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP8]], [[EVL_BASED_IV]]
230229
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
231-
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
232-
; IF-EVL-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
230+
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
231+
; IF-EVL-INLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
233232
; IF-EVL-INLOOP: middle.block:
234233
; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
235234
; IF-EVL-INLOOP: scalar.ph:
236235
; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
237-
; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
236+
; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
238237
; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
239238
; IF-EVL-INLOOP: for.body:
240239
; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
241240
; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
242241
; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
243-
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
244-
; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32
242+
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
243+
; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
245244
; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
246245
; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
247246
; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
248247
; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
249248
; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
250-
; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
249+
; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
251250
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
252251
; IF-EVL-INLOOP: for.cond.cleanup:
253252
; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]

0 commit comments

Comments
 (0)