Skip to content

Commit a92577e

Browse files
committed
[LV][EVL] Support sext/zext/truncate of cast instruction with EVL-vectorization
1 parent c978d05 commit a92577e

File tree

7 files changed

+369
-18
lines changed

7 files changed

+369
-18
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 75 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
885885
case VPRecipeBase::VPWidenCallSC:
886886
case VPRecipeBase::VPWidenCanonicalIVSC:
887887
case VPRecipeBase::VPWidenCastSC:
888+
case VPRecipeBase::VPWidenCastEVLSC:
888889
case VPRecipeBase::VPWidenGEPSC:
889890
case VPRecipeBase::VPWidenSC:
890891
case VPRecipeBase::VPWidenEVLSC:
@@ -1076,6 +1077,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
10761077
R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
10771078
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
10781079
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
1080+
R->getVPDefID() == VPRecipeBase::VPWidenCastEVLSC ||
10791081
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
10801082
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
10811083
}
@@ -1534,19 +1536,28 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15341536
/// Result type for the cast.
15351537
Type *ResultTy;
15361538

1537-
public:
1538-
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1539-
CastInst &UI)
1540-
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
1539+
protected:
1540+
VPWidenCastRecipe(unsigned VPDefOpcode, Instruction::CastOps Opcode,
1541+
VPValue *Op, Type *ResultTy, CastInst &UI)
1542+
: VPRecipeWithIRFlags(VPDefOpcode, Op, UI), Opcode(Opcode),
15411543
ResultTy(ResultTy) {
15421544
assert(UI.getOpcode() == Opcode &&
15431545
"opcode of underlying cast doesn't match");
15441546
}
15451547

1546-
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1547-
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
1548+
VPWidenCastRecipe(unsigned VPDefOpcode, Instruction::CastOps Opcode,
1549+
VPValue *Op, Type *ResultTy)
1550+
: VPRecipeWithIRFlags(VPDefOpcode, Op), Opcode(Opcode),
15481551
ResultTy(ResultTy) {}
15491552

1553+
public:
1554+
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1555+
CastInst &UI)
1556+
: VPWidenCastRecipe(VPDef::VPWidenCastSC, Opcode, Op, ResultTy, UI) {}
1557+
1558+
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1559+
: VPWidenCastRecipe(VPDef::VPWidenCastSC, Opcode, Op, ResultTy) {}
1560+
15501561
~VPWidenCastRecipe() override = default;
15511562

15521563
VPWidenCastRecipe *clone() override {
@@ -1557,7 +1568,15 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15571568
return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
15581569
}
15591570

1560-
VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
1571+
static inline bool classof(const VPRecipeBase *R) {
1572+
return R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
1573+
R->getVPDefID() == VPRecipeBase::VPWidenCastEVLSC;
1574+
}
1575+
1576+
static inline bool classof(const VPUser *U) {
1577+
auto *R = dyn_cast<VPRecipeBase>(U);
1578+
return R && classof(R);
1579+
}
15611580

15621581
/// Produce widened copies of the cast.
15631582
void execute(VPTransformState &State) override;
@@ -1574,6 +1593,55 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15741593
Type *getResultType() const { return ResultTy; }
15751594
};
15761595

1596+
// A recipe for widening cast operation with vector-predication intrinsics with
1597+
/// explicit vector length (EVL).
1598+
class VPWidenCastEVLRecipe : public VPWidenCastRecipe {
1599+
using VPRecipeWithIRFlags::transferFlags;
1600+
1601+
public:
1602+
VPWidenCastEVLRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1603+
VPValue &EVL)
1604+
: VPWidenCastRecipe(VPDef::VPWidenCastEVLSC, Opcode, Op, ResultTy) {
1605+
addOperand(&EVL);
1606+
}
1607+
1608+
VPWidenCastEVLRecipe(VPWidenCastRecipe &W, VPValue &EVL)
1609+
: VPWidenCastEVLRecipe(W.getOpcode(), W.getOperand(0), W.getResultType(),
1610+
EVL) {
1611+
transferFlags(W);
1612+
}
1613+
1614+
~VPWidenCastEVLRecipe() override = default;
1615+
1616+
VPWidenCastEVLRecipe *clone() final {
1617+
llvm_unreachable("VPWidenEVLRecipe cannot be cloned");
1618+
return nullptr;
1619+
}
1620+
1621+
VP_CLASSOF_IMPL(VPDef::VPWidenCastEVLSC)
1622+
1623+
VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
1624+
const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
1625+
1626+
/// Produce a vp-intrinsic copies of the cast.
1627+
void execute(VPTransformState &State) final;
1628+
1629+
/// Returns true if the recipe only uses the first lane of operand \p Op.
1630+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1631+
assert(is_contained(operands(), Op) &&
1632+
"Op must be an operand of the recipe");
1633+
// EVL in that recipe is always the last operand, thus any use before means
1634+
// the VPValue should be vectorized.
1635+
return getEVL() == Op;
1636+
}
1637+
1638+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1639+
/// Print the recipe.
1640+
void print(raw_ostream &O, const Twine &Indent,
1641+
VPSlotTracker &SlotTracker) const final;
1642+
#endif
1643+
};
1644+
15771645
/// VPScalarCastRecipe is a recipe to create scalar cast instructions.
15781646
class VPScalarCastRecipe : public VPSingleDefRecipe {
15791647
Instruction::CastOps Opcode;

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
8888
case VPReductionSC:
8989
case VPWidenCanonicalIVSC:
9090
case VPWidenCastSC:
91+
case VPWidenCastEVLSC:
9192
case VPWidenGEPSC:
9293
case VPWidenIntOrFpInductionSC:
9394
case VPWidenLoadEVLSC:
@@ -131,6 +132,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
131132
case VPReductionSC:
132133
case VPWidenCanonicalIVSC:
133134
case VPWidenCastSC:
135+
case VPWidenCastEVLSC:
134136
case VPWidenGEPSC:
135137
case VPWidenIntOrFpInductionSC:
136138
case VPWidenPHISC:
@@ -167,6 +169,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
167169
case VPScalarIVStepsSC:
168170
case VPWidenCanonicalIVSC:
169171
case VPWidenCastSC:
172+
case VPWidenCastEVLSC:
170173
case VPWidenGEPSC:
171174
case VPWidenIntOrFpInductionSC:
172175
case VPWidenPHISC:
@@ -1419,16 +1422,56 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
14191422
State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
14201423
}
14211424

1425+
/// Generate the explicit-vector-length form of the cast for part 0: the first
/// operand is converted to a vector of the recipe's result type through
/// VectorBuilder, using an all-true mask and the recipe's EVL operand.
/// Only sext, zext and trunc are handled.
void VPWidenCastEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  // TODO: add more cast instructions, e.g. fptoint/inttofp/inttoptr/fptofp.
  const bool IsSupportedCast = Opcode == Instruction::SExt ||
                               Opcode == Instruction::ZExt ||
                               Opcode == Instruction::Trunc;
  // An unsupported opcode would leave this recipe without a generated value;
  // flag that in asserts builds instead of silently falling through. Release
  // builds keep the previous behavior of emitting nothing.
  assert(IsSupportedCast && "Unsupported cast opcode for EVL widening");
  if (!IsSupportedCast)
    return;

  Value *SrcVal = State.get(getOperand(0), 0);
  VectorType *DsType = VectorType::get(getResultType(), State.VF);

  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  // The cast itself is unmasked: use an all-true mask and let the EVL operand
  // bound the active lanes.
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
  Builder.setMask(Mask).setEVL(State.get(getEVL(), 0, /*NeedsScalar=*/true));

  Value *VPInst =
      Builder.createVectorInstruction(Opcode, DsType, {SrcVal}, "vp.cast");

  // The recipe may have no underlying instruction (the EVL constructors build
  // it without one), so only copy IR flags when there is a source to copy
  // from.
  // NOTE(review): the builder presumably returns a vp intrinsic call rather
  // than a CastInst, in which case this never fires - confirm whether flag
  // propagation is needed here at all.
  auto *UnderlyingInstr = dyn_cast_or_null<Instruction>(getUnderlyingValue());
  if (UnderlyingInstr)
    if (auto *VecOp = dyn_cast_or_null<CastInst>(VPInst))
      VecOp->copyIRFlags(UnderlyingInstr);

  State.set(this, VPInst, 0);
  State.addMetadata(VPInst, UnderlyingInstr);
}
1455+
14221456
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
14231457
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
14241458
VPSlotTracker &SlotTracker) const {
14251459
O << Indent << "WIDEN-CAST ";
14261460
printAsOperand(O, SlotTracker);
1427-
O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1461+
O << " = " << Instruction::getOpcodeName(Opcode);
14281462
printFlags(O);
14291463
printOperands(O, SlotTracker);
14301464
O << " to " << *getResultType();
14311465
}
1466+
1467+
/// Print the recipe as "WIDEN-CAST <result> = vp.<opcode>", followed by its
/// flags and operands.
void VPWidenCastEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent;
  O << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);

  const char *OpcodeName = Instruction::getOpcodeName(getOpcode());
  O << " = vp.";
  O << OpcodeName;

  printFlags(O);
  printOperands(O, SlotTracker);
}
14321475
#endif
14331476

14341477
/// This function adds

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1379,6 +1379,15 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
13791379
return nullptr;
13801380
return new VPWidenEVLRecipe(*W, EVL);
13811381
})
1382+
.Case<VPWidenCastRecipe>(
1383+
[&](VPWidenCastRecipe *W) -> VPRecipeBase * {
1384+
unsigned Opcode = W->getOpcode();
1385+
if (Opcode != Instruction::SExt &&
1386+
Opcode != Instruction::ZExt &&
1387+
Opcode != Instruction::Trunc)
1388+
return nullptr;
1389+
return new VPWidenCastEVLRecipe(*W, EVL);
1390+
})
13821391
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
13831392
VPValue *NewMask = GetNewMask(Red->getCondOp());
13841393
return new VPReductionEVLRecipe(*Red, EVL, NewMask);

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ class VPDef {
349349
VPWidenCallSC,
350350
VPWidenCanonicalIVSC,
351351
VPWidenCastSC,
352+
VPWidenCastEVLSC,
352353
VPWidenGEPSC,
353354
VPWidenLoadEVLSC,
354355
VPWidenLoadSC,

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
148148
return VerifyEVLUse(
149149
*W, Instruction::isUnaryOp(W->getOpcode()) ? 1 : 2);
150150
})
151+
.Case<VPWidenCastEVLRecipe>([&](const VPWidenCastEVLRecipe *C) {
152+
return VerifyEVLUse(*C, 1);
153+
})
151154
.Case<VPReductionEVLRecipe>([&](const VPReductionEVLRecipe *R) {
152155
return VerifyEVLUse(*R, 2);
153156
})

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -159,38 +159,38 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
159159
; IF-EVL-INLOOP: vector.body:
160160
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
161161
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
162-
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
162+
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
163163
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
164164
; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true)
165165
; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0
166166
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
167167
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
168168
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
169-
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
170-
; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP10]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
171-
; IF-EVL-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
169+
; IF-EVL-INLOOP-NEXT: [[VP_CAST:%.*]] = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> [[VP_OP_LOAD]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
170+
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[VP_CAST]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
171+
; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
172172
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
173173
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
174-
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
175-
; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
174+
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
175+
; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
176176
; IF-EVL-INLOOP: middle.block:
177177
; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
178178
; IF-EVL-INLOOP: scalar.ph:
179179
; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
180-
; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
180+
; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
181181
; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
182182
; IF-EVL-INLOOP: for.body:
183183
; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
184184
; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
185185
; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
186-
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
187-
; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
186+
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
187+
; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
188188
; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
189189
; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
190190
; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
191191
; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
192192
; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
193-
; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
193+
; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
194194
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
195195
; IF-EVL-INLOOP: for.cond.cleanup:
196196
; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]

0 commit comments

Comments
 (0)