Skip to content

Commit ca9c56d

Browse files
committed
Step 3: Expand ExtractFromEnd to sub + extractelement
This transformation is merely to avoid potential issues. The current folding by EVL was developed based on folding by mask; as a result, the ResumePhi should already be dead code. I just made this change so I can sleep better at night :P
1 parent fdf9fc0 commit ca9c56d

File tree

6 files changed

+78
-26
lines changed

6 files changed

+78
-26
lines changed

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
7878
case VPInstruction::CanonicalIVIncrementForPart:
7979
case VPInstruction::AnyOf:
8080
return SetResultTyFromOp();
81+
case Instruction::ExtractElement:
8182
case VPInstruction::ExtractFirstActive:
8283
case VPInstruction::ExtractFromEnd: {
8384
Type *BaseTy = inferScalarType(R->getOperand(0));

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
478478
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
479479
return Builder.CreateSelect(Cond, Op1, Op2, Name);
480480
}
481+
case Instruction::ExtractElement: {
482+
assert(State.VF.isVector() && "Only extract elements from vectors");
483+
Value *Vec = State.get(getOperand(0));
484+
Value *Idx = State.get(getOperand(1), /*IsScalar*/ true);
485+
return Builder.CreateExtractElement(Vec, Idx, Name);
486+
}
481487
case VPInstruction::ActiveLaneMask: {
482488
// Get first lane of vector induction variable.
483489
Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
@@ -752,7 +758,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
752758
}
753759

754760
bool VPInstruction::isVectorToScalar() const {
755-
return getOpcode() == VPInstruction::ExtractFromEnd ||
761+
return getOpcode() == Instruction::ExtractElement ||
762+
getOpcode() == VPInstruction::ExtractFromEnd ||
756763
getOpcode() == VPInstruction::ExtractFirstActive ||
757764
getOpcode() == VPInstruction::ComputeReductionResult ||
758765
getOpcode() == VPInstruction::AnyOf;
@@ -814,6 +821,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
814821
switch (getOpcode()) {
815822
case Instruction::ICmp:
816823
case Instruction::Select:
824+
case Instruction::ExtractElement:
817825
case VPInstruction::AnyOf:
818826
case VPInstruction::CalculateTripCountMinusVF:
819827
case VPInstruction::CanonicalIVIncrementForPart:
@@ -851,6 +859,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
851859
case VPInstruction::BranchOnCond:
852860
case VPInstruction::ResumePhi:
853861
return true;
862+
case Instruction::ExtractElement:
863+
return Op == getOperand(1);
854864
};
855865
llvm_unreachable("switch should return");
856866
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1635,6 +1635,48 @@ void VPlanTransforms::addActiveLaneMask(
16351635
HeaderMask->replaceAllUsesWith(LaneMask);
16361636
}
16371637

1638+
/// Adjust the way the resume value is obtained when using tail folding by EVL.
1639+
/// Expanding ExtractFromEnd since the penultimate EVL may not be equal to
1640+
/// VFxUF. Expand
1641+
/// %resume = ExtractFromEnd %vec, 1
1642+
/// to
1643+
/// %last.active.idx = sub %EVL, 1
1644+
/// %resume = extractelement %vec, %last.active.idx
1645+
static void adjustResumePhisForEVL(VPlan &Plan, VPValue &EVL) {
1646+
LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
1647+
using namespace VPlanPatternMatch;
1648+
for (VPRecipeBase &R : *cast<VPBasicBlock>(Plan.getScalarPreheader())) {
1649+
VPValue *FromMiddleBlock;
1650+
if (!match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
1651+
m_VPValue(FromMiddleBlock), m_VPValue())))
1652+
continue;
1653+
1654+
VPValue *ExtractFrom;
1655+
if (match(FromMiddleBlock, m_VPInstruction<VPInstruction::ExtractFromEnd>(
1656+
m_VPValue(ExtractFrom), m_SpecificInt(1)))) {
1657+
// Skip if all elements are uniform.
1658+
if (vputils::isUniformAfterVectorization(ExtractFrom))
1659+
continue;
1660+
auto *ExtractR = cast<VPInstruction>(FromMiddleBlock);
1661+
VPBuilder Builder(ExtractR);
1662+
VPValue *OneVPV =
1663+
Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(Ctx), 1));
1664+
VPValue *LastActiveIdx =
1665+
Builder.createNaryOp(Instruction::Sub, {&EVL, OneVPV},
1666+
ExtractR->getDebugLoc(), "last.active.idx");
1667+
VPValue *NewExtract = Builder.createNaryOp(
1668+
Instruction::ExtractElement, {ExtractFrom, LastActiveIdx},
1669+
ExtractR->getDebugLoc(), ExtractR->getName());
1670+
ExtractR->replaceAllUsesWith(NewExtract);
1671+
ExtractR->eraseFromParent();
1672+
}
1673+
assert((!dyn_cast<VPInstruction>(FromMiddleBlock) ||
1674+
cast<VPInstruction>(FromMiddleBlock)->getOpcode() !=
1675+
VPInstruction::ExtractFromEnd) &&
1676+
"Only extract the last lane for resumed values");
1677+
}
1678+
}
1679+
16381680
/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
16391681
/// nullptr if no EVL-based recipe could be created.
16401682
/// \p HeaderMask Header Mask.
@@ -1799,6 +1841,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
17991841
}
18001842
}
18011843

1844+
adjustResumePhisForEVL(Plan, EVL);
1845+
18021846
for (VPRecipeBase *R : reverse(ToErase)) {
18031847
SmallVector<VPValue *> PossiblyDead(R->operands());
18041848
R->eraseFromParent();

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,9 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
153153
.Case<VPScalarCastRecipe>(
154154
[&](const VPScalarCastRecipe *S) { return VerifyEVLUse(*S, 0); })
155155
.Case<VPInstruction>([&](const VPInstruction *I) {
156+
// Used when extracting the element at the last active lane.
157+
if (I->getOpcode() == Instruction::Sub)
158+
return VerifyEVLUse(*I, 0);
156159
if (I->getOpcode() != Instruction::Add) {
157160
errs() << "EVL is used as an operand in non-VPInstruction::Add\n";
158161
return false;

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
5757
; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
5858
; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
5959
; IF-EVL: [[MIDDLE_BLOCK]]:
60-
; IF-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
61-
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4
62-
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
60+
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP12]], 1
6361
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[VP_OP_LOAD]], i32 [[TMP23]]
6462
; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
6563
; IF-EVL: [[SCALAR_PH]]:
@@ -210,24 +208,20 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
210208
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
211209
; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
212210
; IF-EVL: [[MIDDLE_BLOCK]]:
213-
; IF-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
214-
; IF-EVL-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 4
215-
; IF-EVL-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1
211+
; IF-EVL-NEXT: [[TMP27:%.*]] = sub i32 [[TMP15]], 1
216212
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[VP_OP_LOAD]], i32 [[TMP27]]
217-
; IF-EVL-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
218-
; IF-EVL-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4
219-
; IF-EVL-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1
213+
; IF-EVL-NEXT: [[TMP30:%.*]] = sub i32 [[TMP15]], 1
220214
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <vscale x 4 x i32> [[TMP19]], i32 [[TMP30]]
221215
; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
222216
; IF-EVL: [[SCALAR_PH]]:
223217
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
224218
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
225-
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
219+
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
226220
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
227221
; IF-EVL: [[FOR_BODY]]:
228222
; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
229223
; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ]
230-
; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
224+
; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT5]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
231225
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
232226
; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4
233227
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]]
@@ -389,30 +383,24 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
389383
; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
390384
; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
391385
; IF-EVL: [[MIDDLE_BLOCK]]:
392-
; IF-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
393-
; IF-EVL-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4
394-
; IF-EVL-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
386+
; IF-EVL-NEXT: [[TMP31:%.*]] = sub i32 [[TMP18]], 1
395387
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[VP_OP_LOAD]], i32 [[TMP31]]
396-
; IF-EVL-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
397-
; IF-EVL-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4
398-
; IF-EVL-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1
388+
; IF-EVL-NEXT: [[TMP34:%.*]] = sub i32 [[TMP18]], 1
399389
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <vscale x 4 x i32> [[TMP22]], i32 [[TMP34]]
400-
; IF-EVL-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32()
401-
; IF-EVL-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 4
402-
; IF-EVL-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], 1
390+
; IF-EVL-NEXT: [[TMP37:%.*]] = sub i32 [[TMP18]], 1
403391
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[TMP23]], i32 [[TMP37]]
404392
; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
405393
; IF-EVL: [[SCALAR_PH]]:
406394
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
407395
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
408-
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
409-
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ]
396+
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
397+
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT11:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ]
410398
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
411399
; IF-EVL: [[FOR_BODY]]:
412400
; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
413401
; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ]
414-
; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
415-
; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ]
402+
; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
403+
; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT11]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ]
416404
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
417405
; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4
418406
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]]

llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,15 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
4848
; IF-EVL-NEXT: Successor(s): middle.block
4949
; IF-EVL-EMPTY:
5050
; IF-EVL: middle.block:
51-
; IF-EVL-NEXT: EMIT vp<[[RESUME_EXTRACT:%.+]]> = extract-from-end ir<[[LD]]>, ir<1>
51+
; IF-EVL-NEXT: EMIT vp<[[LAST_ACTIVE_IDX:%.+]]> = sub vp<[[EVL]]>, ir<1>
52+
; IF-EVL-NEXT: EMIT vp<[[RESUME_EXTRACT:%.+]]> = extractelement ir<[[LD]]>, vp<[[LAST_ACTIVE_IDX]]>
5253
; IF-EVL-NEXT: EMIT branch-on-cond ir<true>
5354
; IF-EVL-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
55+
; IF-EVL-EMPTY:
56+
; IF-EVL: scalar.ph:
57+
; IF-EVL-NEXT: EMIT vp<%bc.resume.val> = resume-phi vp<[[VTC]]>, ir<0>
58+
; IF-EVL-NEXT: EMIT vp<%scalar.recur.init> = resume-phi vp<[[RESUME_EXTRACT]]>, ir<33>
59+
; IF-EVL-NEXT: Successor(s): ir-bb<for.body>
5460

5561
entry:
5662
br label %for.body

0 commit comments

Comments
 (0)