Commit 6dcd584

[VPlan] Introduce recipes for VP loads and stores.
Introduce new subclasses of VPWidenMemoryRecipe for VP (vector-predicated) loads and stores to address multiple TODOs from #76172.

Note that the introduction of the new recipes also improves code-gen for VP gathers/scatters by removing the redundant header mask. With the new approach, it is not sufficient to look at users of the widened canonical IV to find all uses of the header mask: in some cases, a widened IV is used instead of separately widening the canonical IV. To handle those cases, iterate over all recipes in the vector loop region to make sure all widened memory recipes are processed.

Depends on #87411.
1 parent 2ec0e32 commit 6dcd584
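
For readers unfamiliar with the transform side, the following is a minimal, hypothetical sketch of the kind of recipe replacement described above: walk every recipe in the vector loop region and swap plain widened memory recipes for their VP counterparts. This is not code from this commit (the corresponding VPlanTransforms changes are not shown on this page); the helper name and the way the EVL value is passed in are assumptions, while the recipe constructors and accessors match the VPlan.h definitions added below.

// Hypothetical, simplified sketch (not from this commit): replace widened
// memory recipes in the vector loop region with VP recipes, threading an
// already-created EVL value through. Reverse accesses and header-mask
// simplification are ignored here.
static void replaceMemoryRecipesWithVPRecipes(VPlan &Plan, VPValue *EVL) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (auto *L = dyn_cast<VPWidenLoadRecipe>(&R)) {
        // Loads define a value, so redirect users to the new recipe.
        auto *N = new VPWidenVPLoadRecipe(
            cast<LoadInst>(L->getIngredient()), L->getAddr(), EVL,
            L->getMask(), L->isConsecutive(), L->getDebugLoc());
        N->insertBefore(L);
        L->replaceAllUsesWith(N);
        L->eraseFromParent();
      } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(&R)) {
        // Stores define no value; just replace the recipe in place.
        auto *N = new VPWidenVPStoreRecipe(
            cast<StoreInst>(S->getIngredient()), S->getStoredValue(),
            S->getAddr(), EVL, S->getMask(), S->isConsecutive(),
            S->getDebugLoc());
        N->insertBefore(S);
        S->eraseFromParent();
      }
    }
  }
}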

File tree

8 files changed: +260, -120 lines changed


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 89 additions & 83 deletions
@@ -9417,52 +9417,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
   }
 
-/// Creates either vp_store or vp_scatter intrinsics calls to represent
-/// predicated store/scatter.
-static Instruction *
-lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
-                                Value *StoredVal, bool IsScatter, Value *Mask,
-                                Value *EVL, const Align &Alignment) {
-  CallInst *Call;
-  if (IsScatter) {
-    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                   Intrinsic::vp_scatter,
-                                   {StoredVal, Addr, Mask, EVL});
-  } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    Call = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
-  }
-  Call->addParamAttr(
-      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
-}
-
-/// Creates either vp_load or vp_gather intrinsics calls to represent
-/// predicated load/gather.
-static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
-                                                   VectorType *DataTy,
-                                                   Value *Addr, bool IsGather,
-                                                   Value *Mask, Value *EVL,
-                                                   const Align &Alignment) {
-  CallInst *Call;
-  if (IsGather) {
-    Call =
-        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
-                                nullptr, "wide.masked.gather");
-  } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    Call = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
-  }
-  Call->addParamAttr(
-      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
-}
-
 void VPWidenLoadRecipe::execute(VPTransformState &State) {
   // Attempt to issue a wide load.
   auto *LI = cast<LoadInst>(&Ingredient);
@@ -9491,25 +9445,7 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
-    // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse loading after vp_reverse is added.
-      Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
-      NewLI = lowerLoadUsingVectorIntrinsics(
-          Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
-          CreateGather, MaskPart, EVL, Alignment);
-    } else if (CreateGather) {
+    if (CreateGather) {
       Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
@@ -9535,6 +9471,51 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   }
 }
 
+void VPWidenVPLoadRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse loading after vp_reverse is added.
+  assert(!isReverse() && "Reverse loads are not implemented yet.");
+
+  // Attempt to issue a wide load.
+  auto *LI = cast<LoadInst>(&Ingredient);
+
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  bool CreateGather = !isConsecutive();
+
+  auto &Builder = State.Builder;
+  // Handle loads.
+  assert(LI && "Must have a load instruction");
+  State.setDebugLocFrom(getDebugLoc());
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    CallInst *NewLI;
+    Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+    Value *Addr = State.get(getAddr(), Part, !CreateGather);
+    Value *Mask =
+        getMask()
+            ? State.get(getMask(), Part)
+            : Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+    if (CreateGather) {
+      NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
+                                      {Addr, Mask, EVL}, nullptr,
+                                      "wide.masked.gather");
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Load, DataTy, Addr, "vp.op.load"));
+    }
+    NewLI->addParamAttr(
+        0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+
+    // Add metadata to the load.
+    State.addMetadata(NewLI, LI);
+    State.set(this, NewLI, Part);
+  }
+}
+
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);

@@ -9562,24 +9543,7 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
     Instruction *NewSI = nullptr;
     Value *StoredVal = State.get(StoredValue, Part);
     // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse store after vp_reverse is added.
-      Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
-      NewSI = lowerStoreUsingVectorIntrinsics(
-          Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
-          CreateScatter, MaskPart, EVL, Alignment);
-    } else if (CreateScatter) {
+    if (CreateScatter) {
      Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9603,6 +9567,48 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   }
 }
 
+void VPWidenVPStoreRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse loading after vp_reverse is added.
+  assert(!isReverse() && "Reverse store are not implemented yet.");
+
+  auto *SI = cast<StoreInst>(&Ingredient);
+
+  VPValue *StoredValue = getStoredValue();
+  bool CreateScatter = !isConsecutive();
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    CallInst *NewSI = nullptr;
+    Value *StoredVal = State.get(StoredValue, Part);
+    Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+    // FIXME: Support reverse store after vp_reverse is added.
+    Value *Mask =
+        getMask()
+            ? State.get(getMask(), Part)
+            : Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+    Value *Addr = State.get(getAddr(), Part, !CreateScatter);
+    if (CreateScatter) {
+      NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                      Intrinsic::vp_scatter,
+                                      {StoredVal, Addr, Mask, EVL});
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Store, Type::getVoidTy(EVL->getContext()),
+          {StoredVal, Addr}));
+    }
+    NewSI->addParamAttr(
+        1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+
+    State.addMetadata(NewSI, SI);
+  }
+}
 // Determine how to lower the scalar epilogue, which depends on 1) optimising
 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
 // predication, and 4) a TTI hook that analyses whether the loop is suitable

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 103 additions & 11 deletions
@@ -242,15 +242,6 @@ struct VPTransformState {
   ElementCount VF;
   unsigned UF;
 
-  /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
-  /// value set during plan transformation, possibly a default value = whole
-  /// vector register length. EVL is created only if TTI prefers predicated
-  /// vectorization, thus if EVL is not nullptr it also implies preference for
-  /// predicated vectorization.
-  /// TODO: this is a temporarily solution, the EVL must be explicitly used by
-  /// the recipes and must be removed here.
-  VPValue *EVL = nullptr;
-
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
   /// instructions.
@@ -2304,8 +2295,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   VPRecipeBase *clone() override = 0;
 
   static inline bool classof(const VPRecipeBase *R) {
-    return R->getVPDefID() == VPDef::VPWidenLoadSC ||
-           R->getVPDefID() == VPDef::VPWidenStoreSC;
+    return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenVPLoadSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenVPStoreSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2320,6 +2313,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
       return getNumOperands() == 2;
     case VPDef::VPWidenStoreSC:
       return getNumOperands() == 3;
+    case VPDef::VPWidenVPLoadSC:
+      return getNumOperands() == 3;
+    case VPDef::VPWidenVPStoreSC:
+      return getNumOperands() == 4;
     default:
       llvm_unreachable("unhandled recipe");
     }
@@ -2329,8 +2326,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
   VPValue *getAddr() const {
     switch (getVPDefID()) {
     case VPDef::VPWidenLoadSC:
+    case VPDef::VPWidenVPLoadSC:
      return getOperand(0);
     case VPDef::VPWidenStoreSC:
+    case VPDef::VPWidenVPStoreSC:
      return getOperand(1);
     default:
      llvm_unreachable("unhandled recipe");
@@ -2392,7 +2391,51 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
+    // Widened, consecutive memory operations only demand the first lane of
+    // their address, unless the same operand is also stored. That latter can
+    // happen with opaque pointers.
+    return Op == getAddr() && isConsecutive();
+  }
+};
 
+/// A recipe for widening load operations with vector-predication intrinsics,
+/// using the address to load from, the explicit vector length and an optional
+/// mask.
+struct VPWidenVPLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
+  VPWidenVPLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *EVL,
+                      VPValue *Mask, bool IsConsecutive, DebugLoc DL)
+      : VPWidenMemoryRecipe(VPDef::VPWidenVPLoadSC, Load, {Addr, EVL},
+                            IsConsecutive, false, DL),
+        VPValue(this, &Load) {
+    setMask(Mask);
+  }
+
+  VPRecipeBase *clone() override {
+    return new VPWidenVPLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+                                   getEVL(), getMask(), isConsecutive(),
+                                   getDebugLoc());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenVPLoadSC)
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(1); }
+
+  /// Generate the wide load/store.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    if (Op == getEVL())
+      return true;
     // Widened, consecutive loads operations only demand the first lane of
     // their address.
     return Op == getAddr() && isConsecutive();
@@ -2439,6 +2482,55 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
   }
 };
+
+/// A recipe for widening store operations with vector-predication intrinsics,
+/// using the value to store, the address to store to , the explicit vector
+/// length and an optional mask.
+struct VPWidenVPStoreRecipe final : public VPWidenMemoryRecipe {
+  VPWidenVPStoreRecipe(StoreInst &Store, VPValue *StoredVal, VPValue *Addr,
+                       VPValue *EVL, VPValue *Mask, bool IsConsecutive,
+                       DebugLoc DL)
+      : VPWidenMemoryRecipe(VPDef::VPWidenVPStoreSC, Store,
+                            {StoredVal, Addr, EVL}, IsConsecutive, false, DL) {
+    setMask(Mask);
+  }
+
+  VPRecipeBase *clone() override {
+    return new VPWidenVPStoreRecipe(cast<StoreInst>(Ingredient),
+                                    getStoredValue(), getAddr(), getEVL(),
+                                    getMask(), isConsecutive(), getDebugLoc());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenVPStoreSC)
+
+  /// Return the address accessed by this recipe.
+  VPValue *getStoredValue() const { return getOperand(0); }
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(2); }
+
+  /// Generate the wide load/store.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    if (Op == getEVL())
+      return true;
+    // Widened, consecutive memory operations only demand the first lane of
+    // their address, unless the same operand is also stored. That latter can
+    // happen with opaque pointers.
+    return Op == getAddr() && isConsecutive() && Op != getStoredValue();
+  }
+};
+
 /// Recipe to expand a SCEV expression.
 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *Expr;
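
As a usage note on the operand layout encoded above (load: {Addr, EVL} plus an optional mask; store: {StoredVal, Addr, EVL} plus an optional mask), here is a small hypothetical construction sketch. The ingredient and VPValue arguments are placeholders rather than values from this patch; it simply shows why getAddr() returns operand 0 for the VP load but operand 1 for the VP store.

// Hypothetical helper, not part of this commit: constructs one VP load and
// one VP store recipe to make the operand indices used by the accessors
// above concrete. All arguments are assumed to exist in the enclosing VPlan.
static void illustrateVPMemoryRecipeOperands(LoadInst &Load, StoreInst &Store,
                                             VPValue *Addr, VPValue *StoredVal,
                                             VPValue *EVL, VPValue *Mask,
                                             DebugLoc DL) {
  auto *LoadR = new VPWidenVPLoadRecipe(Load, Addr, EVL, Mask,
                                        /*IsConsecutive=*/true, DL);
  assert(LoadR->getAddr() == Addr && "load address is operand 0");
  assert(LoadR->getEVL() == EVL && "load EVL is operand 1");
  // setMask(Mask) in the constructor appended the mask as the last operand.

  auto *StoreR = new VPWidenVPStoreRecipe(Store, StoredVal, Addr, EVL, Mask,
                                          /*IsConsecutive=*/true, DL);
  assert(StoreR->getStoredValue() == StoredVal && "stored value is operand 0");
  assert(StoreR->getAddr() == Addr && "store address is operand 1");
  assert(StoreR->getEVL() == EVL && "store EVL is operand 2");
  (void)LoadR;
  (void)StoreR;
}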

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert(isa<VPWidenLoadRecipe>(R) &&
+  assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenVPLoadRecipe>(R)) &&
          "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
