
Commit 771c587

Author: git apple-llvm automerger
Merge commit '70535f5e609f' from llvm.org/main into next
Parents: e995b1f + 70535f5

File tree: 10 files changed (+442, -523 lines)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 3 additions & 147 deletions
@@ -641,10 +641,6 @@ class InnerLoopVectorizer {
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
 
-  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
-  /// represented as.
-  void truncateToMinimalBitwidths(VPTransformState &State);
-
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

@@ -3429,151 +3425,8 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) {
   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
 }
 
-void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
-  // For every instruction `I` in MinBWs, truncate the operands, create a
-  // truncated version of `I` and reextend its result. InstCombine runs
-  // later and will remove any ext/trunc pairs.
-  SmallPtrSet<Value *, 4> Erased;
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    // If the instruction is defined outside the loop, only update the first
-    // part; the first part will be re-used for all other parts.
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
-        continue;
-      Type *OriginalTy = I->getType();
-      Type *ScalarTruncatedTy =
-          IntegerType::get(OriginalTy->getContext(), KV.second);
-      auto *TruncatedTy = VectorType::get(
-          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
-      if (TruncatedTy == OriginalTy)
-        continue;
-
-      IRBuilder<> B(cast<Instruction>(I));
-      auto ShrinkOperand = [&](Value *V) -> Value * {
-        if (auto *ZI = dyn_cast<ZExtInst>(V))
-          if (ZI->getSrcTy() == TruncatedTy)
-            return ZI->getOperand(0);
-        return B.CreateZExtOrTrunc(V, TruncatedTy);
-      };
-
-      // The actual instruction modification depends on the instruction type,
-      // unfortunately.
-      Value *NewI = nullptr;
-      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
-        Value *Op0 = ShrinkOperand(BO->getOperand(0));
-        Value *Op1 = ShrinkOperand(BO->getOperand(1));
-        NewI = B.CreateBinOp(BO->getOpcode(), Op0, Op1);
-
-        // Any wrapping introduced by shrinking this operation shouldn't be
-        // considered undefined behavior. So, we can't unconditionally copy
-        // arithmetic wrapping flags to NewI.
-        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
-      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
-        Value *Op0 = ShrinkOperand(CI->getOperand(0));
-        Value *Op1 = ShrinkOperand(CI->getOperand(1));
-        NewI = B.CreateICmp(CI->getPredicate(), Op0, Op1);
-      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
-        Value *TV = ShrinkOperand(SI->getTrueValue());
-        Value *FV = ShrinkOperand(SI->getFalseValue());
-        NewI = B.CreateSelect(SI->getCondition(), TV, FV);
-      } else if (auto *CI = dyn_cast<CastInst>(I)) {
-        switch (CI->getOpcode()) {
-        default:
-          llvm_unreachable("Unhandled cast!");
-        case Instruction::Trunc:
-          NewI = ShrinkOperand(CI->getOperand(0));
-          break;
-        case Instruction::SExt:
-          NewI = B.CreateSExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        case Instruction::ZExt:
-          NewI = B.CreateZExtOrTrunc(
-              CI->getOperand(0),
-              smallestIntegerVectorType(OriginalTy, TruncatedTy));
-          break;
-        }
-      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        auto Elements0 =
-            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
-        auto Elements1 =
-            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
-        auto *O1 = B.CreateZExtOrTrunc(
-            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
-
-        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
-      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
-        // Don't do anything with the operands, just extend the result.
-        continue;
-      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
-        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
-      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
-        auto Elements =
-            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
-        auto *O0 = B.CreateZExtOrTrunc(
-            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
-        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
-      } else {
-        // If we don't know what to do, be conservative and don't do anything.
-        continue;
-      }
-
-      // Lastly, extend the result.
-      NewI->takeName(cast<Instruction>(I));
-      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
-      I->replaceAllUsesWith(Res);
-      cast<Instruction>(I)->eraseFromParent();
-      Erased.insert(I);
-      State.reset(Def, Res, Part);
-    }
-  }
-
-  // We'll have created a bunch of ZExts that are now parentless. Clean up.
-  for (const auto &KV : Cost->getMinimalBitwidths()) {
-    // If the value wasn't vectorized, we must maintain the original scalar
-    // type. The absence of the value from State indicates that it
-    // wasn't vectorized.
-    // FIXME: Should not rely on getVPValue at this point.
-    VPValue *Def = State.Plan->getVPValue(KV.first, true);
-    if (!State.hasAnyVectorValue(Def))
-      continue;
-    unsigned UFToUse = OrigLoop->contains(KV.first) ? UF : 1;
-    for (unsigned Part = 0; Part < UFToUse; ++Part) {
-      Value *I = State.get(Def, Part);
-      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
-      if (Inst && Inst->use_empty()) {
-        Value *NewI = Inst->getOperand(0);
-        Inst->eraseFromParent();
-        State.reset(Def, NewI, Part);
-      }
-    }
-  }
-}
-
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
-  // Insert truncates and extends for any truncated instructions as hints to
-  // InstCombine.
-  if (VF.isVector())
-    truncateToMinimalBitwidths(State);
-
   // Fix widened non-induction PHIs by setting up the PHI operands.
   if (EnableVPlanNativePath)
     fixNonInductionPHIs(Plan, State);
@@ -8741,6 +8594,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
       // Now optimize the initial VPlan.
+      if (!Plan->hasVF(ElementCount::getFixed(1)))
+        VPlanTransforms::truncateToMinimalBitwidths(
+            *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
       VPlanTransforms::optimize(*Plan, *PSE.getSE());
       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
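
Note: the new guard runs the transform only when the plan does not include VF=1, mirroring the old `if (VF.isVector())` check. As background, here is a minimal standalone C++ sketch (illustrative code, not from this commit) of the invariant minimal-bitwidth shrinking relies on: when only the low bits of a result are demanded, the operation can be performed at the narrow width and the result zero-extended afterwards. It also shows why nsw/nuw flags cannot be carried over: the narrow add may wrap (e.g. 200 + 100 at 8 bits) even when the wide add does not.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Wide computation whose result is only demanded modulo 256 (low 8 bits).
uint32_t wideAddLow8(uint32_t A, uint32_t B) { return (A + B) & 0xFFu; }

// The same computation shrunk to 8 bits: "trunc" the operands, add at the
// narrow width (which may wrap), then "zext" the result back to 32 bits.
uint32_t shrunkAddLow8(uint32_t A, uint32_t B) {
  uint8_t NA = static_cast<uint8_t>(A);           // trunc
  uint8_t NB = static_cast<uint8_t>(B);           // trunc
  uint8_t Narrow = static_cast<uint8_t>(NA + NB); // narrow op, may wrap
  return static_cast<uint32_t>(Narrow);           // zext
}

int main() {
  // Modular arithmetic guarantees both forms agree on the demanded bits.
  for (uint32_t A : {0u, 1u, 200u, 0xFFFFu, 0xFFFFFFFFu})
    for (uint32_t B : {0u, 100u, 255u, 0x80000000u})
      assert(wideAddLow8(A, B) == shrunkAddLow8(A, B));
  return 0;
}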

llvm/lib/Transforms/Vectorize/VPlan.h
Lines changed: 0 additions & 4 deletions

@@ -275,10 +275,6 @@ struct VPTransformState {
            I->second[Part];
   }
 
-  bool hasAnyVectorValue(VPValue *Def) const {
-    return Data.PerPartOutput.contains(Def);
-  }
-
   bool hasScalarValue(VPValue *Def, VPIteration Instance) {
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Lines changed: 7 additions & 2 deletions

@@ -774,9 +774,14 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
   /// Vectorize casts.
   assert(State.VF.isVector() && "Not vectorizing?");
   Type *DestTy = VectorType::get(getResultType(), State.VF);
-
+  VPValue *Op = getOperand(0);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *A = State.get(getOperand(0), Part);
+    if (Part > 0 && Op->isLiveIn()) {
+      // FIXME: Remove once explicit unrolling is implemented using VPlan.
+      State.set(this, State.get(this, 0), Part);
+      continue;
+    }
+    Value *A = State.get(Op, Part);
     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
     State.set(this, Cast, Part);
     State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
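
The early-continue added here reuses part 0's result when the cast's operand is a live-in, i.e. defined outside the loop and therefore identical for every unroll part. A minimal plain-C++ sketch of that reuse pattern, with illustrative names rather than the VPlan API:

#include <vector>

// Compute one value per unroll part; when the input is invariant across
// parts (a "live-in"), do the work once for part 0 and alias that result
// for the remaining parts instead of duplicating the cast.
std::vector<long> castPerPart(int LiveIn, unsigned UF) {
  std::vector<long> PerPart(UF);
  for (unsigned Part = 0; Part < UF; ++Part) {
    if (Part > 0) {               // operand is a live-in in this sketch
      PerPart[Part] = PerPart[0]; // reuse the part-0 result
      continue;
    }
    PerPart[Part] = static_cast<long>(LiveIn); // stand-in for CreateCast
  }
  return PerPart;
}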

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Lines changed: 94 additions & 0 deletions

@@ -870,6 +870,100 @@ static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
   }
 }
 
+void VPlanTransforms::truncateToMinimalBitwidths(
+    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
+    LLVMContext &Ctx) {
+#ifndef NDEBUG
+  // Count the processed recipes and cross check the count later with MinBWs
+  // size, to make sure all entries in MinBWs have been handled.
+  unsigned NumProcessedRecipes = 0;
+#endif
+  // Keep track of created truncates, so they can be re-used. Note that we
+  // cannot use RAUW after creating a new truncate, as this could make other
+  // uses have different types for their operands, making them invalidly
+  // typed.
+  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
+  VPTypeAnalysis TypeInfo(Ctx);
+  VPBasicBlock *PH = Plan.getEntry();
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
+               VPWidenSelectRecipe>(&R))
+        continue;
+
+      VPValue *ResultVPV = R.getVPSingleValue();
+      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
+      unsigned NewResSizeInBits = MinBWs.lookup(UI);
+      if (!NewResSizeInBits)
+        continue;
+
+#ifndef NDEBUG
+      NumProcessedRecipes++;
+#endif
+      // If the value wasn't vectorized, we must maintain the original scalar
+      // type. Skip those here, after incrementing NumProcessedRecipes. Also
+      // skip casts which do not need to be handled explicitly here, as
+      // redundant casts will be removed during recipe simplification.
+      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
+        continue;
+
+      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
+      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
+      assert(OldResTy->isIntegerTy() && "only integer types supported");
+      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
+
+      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
+
+      // Shrink operands by introducing truncates as needed.
+      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
+      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+        auto *Op = R.getOperand(Idx);
+        unsigned OpSizeInBits =
+            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
+        if (OpSizeInBits == NewResSizeInBits)
+          continue;
+        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
+        auto [ProcessedIter, IterIsEmpty] =
+            ProcessedTruncs.insert({Op, nullptr});
+        VPWidenCastRecipe *NewOp =
+            IterIsEmpty
+                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+                : ProcessedIter->second;
+        R.setOperand(Idx, NewOp);
+        if (!IterIsEmpty)
+          continue;
+        ProcessedIter->second = NewOp;
+        if (!Op->isLiveIn()) {
+          NewOp->insertBefore(&R);
+        } else {
+          PH->appendRecipe(NewOp);
+#ifndef NDEBUG
+          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
+          bool IsContained = MinBWs.contains(OpInst);
+          NumProcessedRecipes += IsContained;
+#endif
+        }
+      }
+
+      // Any wrapping introduced by shrinking this operation shouldn't be
+      // considered undefined behavior. So, we can't unconditionally copy
+      // arithmetic wrapping flags to VPW.
+      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
+        VPW->dropPoisonGeneratingFlags();
+
+      // Extend result to original width.
+      auto *Ext =
+          new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
+      Ext->insertAfter(&R);
+      ResultVPV->replaceAllUsesWith(Ext);
+      Ext->setOperand(0, ResultVPV);
+    }
+  }
+
+  assert(MinBWs.size() == NumProcessedRecipes &&
+         "some entries in MinBWs haven't been processed");
+}
+
 void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
   removeRedundantCanonicalIVs(Plan);
   removeRedundantInductionCasts(Plan);
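
Once a plan transformed this way is executed, the emitted IR takes the familiar shape: truncated operands, a narrow operation with wrap flags dropped, and a zero-extend back to the original width, which later simplification folds. Below is a hedged IRBuilder sketch of that shape for an i32 add assumed shrunk to i8 (a standalone illustration, not code from this commit):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Recreate `A32 + B32` at 8 bits: truncate both operands, add at the
// narrow width, and zero-extend the result back to the original type.
// InstCombine / recipe simplification later folds redundant casts.
Value *emitShrunkAdd(IRBuilder<> &B, Value *A32, Value *B32) {
  Type *NarrowTy = B.getInt8Ty();
  Value *NA = B.CreateTrunc(A32, NarrowTy, "a.trunc");
  Value *NB = B.CreateTrunc(B32, NarrowTy, "b.trunc");
  // No nsw/nuw here: shrinking may introduce wrapping that was absent at
  // the original width.
  Value *NarrowAdd = B.CreateAdd(NA, NB, "add.narrow");
  return B.CreateZExt(NarrowAdd, A32->getType(), "add.zext");
}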

llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Lines changed: 7 additions & 0 deletions

@@ -79,6 +79,13 @@ struct VPlanTransforms {
                                    bool UseActiveLaneMaskForControlFlow,
                                    bool DataAndControlFlowWithoutRuntimeCheck);
 
+  /// Insert truncates and extends for any truncated recipe. Redundant casts
+  /// will be folded later.
+  static void
+  truncateToMinimalBitwidths(VPlan &Plan,
+                             const MapVector<Instruction *, uint64_t> &MinBWs,
+                             LLVMContext &Ctx);
+
 private:
   /// Remove redundant VPBasicBlocks by merging them into their predecessor if
   /// the predecessor has a single successor.
