
Commit bcfc9f4

[SLP][REVEC] VectorValuesAndScales should be supported by REVEC. (#135762)
We should align REVEC with the SLP algorithm as closely as possible, for example by applying REVEC-specific handling when calling IRBuilder's Create methods, performing cost analysis via TTI, and expanding shuffle masks using transformScalarShuffleIndicesToVector. Reference commit: 3b18d47
1 parent 9ab2dea commit bcfc9f4
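
The mechanical core of the change is per-lane reduction: when the reduction root is itself a fixed-width vector (the REVEC case), lane I of the result is gathered from the concatenated operands with a strided shuffle mask and reduced on its own, as the comments in the diff below illustrate. Here is a minimal standalone sketch of the mask construction; strideMask is a hypothetical re-implementation for illustration, not the LLVM source, modeling what llvm::createStrideMask(Start, Stride, VF) produces per the mask math visible in the diff:

// Hypothetical model of the strided mask: select lane `Start` of each of the
// VF concatenated sub-vectors of width `Stride` (mask[J] = Start + J * Stride).
#include <cstdio>
#include <vector>

static std::vector<int> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask(VF);
  for (unsigned J = 0; J < VF; ++J)
    Mask[J] = static_cast<int>(Start + J * Stride);
  return Mask;
}

int main() {
  // Two <4 x Ty> operands <a,b,c,d> and <e,f,g,h> live in one <8 x Ty> value
  // <a,b,c,d,e,f,g,h>; lane I of the <4 x Ty> result reduces the elements
  // picked by strideMask(I, /*Stride=*/4, /*VF=*/2).
  const char *Elts = "abcdefgh";
  for (unsigned I = 0; I < 4; ++I) {
    std::printf("Lane[%u] = <", I);
    for (int Idx : strideMask(I, 4, 2))
      std::printf(" %c", Elts[Idx]);
    std::printf(" >\n"); // prints Lane[0] = < a e > ... Lane[3] = < d h >,
  }                      // matching the Lane[...] comments in the diff.
  return 0;
}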

File tree

3 files changed: +144 -77 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 64 additions & 73 deletions
@@ -22144,53 +22144,16 @@ class HorizontalReduction {
       }
 
       Type *ScalarTy = VL.front()->getType();
-      if (isa<FixedVectorType>(ScalarTy)) {
-        assert(SLPReVec && "FixedVectorType is not expected.");
-        unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-        Value *ReducedSubTree = PoisonValue::get(
-            getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements));
-        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
-          // Do reduction for each lane.
-          // e.g., do reduce add for
-          // VL[0] = <4 x Ty> <a, b, c, d>
-          // VL[1] = <4 x Ty> <e, f, g, h>
-          // Lane[0] = <2 x Ty> <a, e>
-          // Lane[1] = <2 x Ty> <b, f>
-          // Lane[2] = <2 x Ty> <c, g>
-          // Lane[3] = <2 x Ty> <d, h>
-          // result[0] = reduce add Lane[0]
-          // result[1] = reduce add Lane[1]
-          // result[2] = reduce add Lane[2]
-          // result[3] = reduce add Lane[3]
-          SmallVector<int, 16> Mask =
-              createStrideMask(I, ScalarTyNumElements, VL.size());
-          Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
-          Value *Val =
-              createSingleOp(Builder, *TTI, Lane,
-                             OptReusedScalars && SameScaleFactor
-                                 ? SameValuesCounter.front().second
-                                 : 1,
-                             Lane->getType()->getScalarType() !=
-                                     VL.front()->getType()->getScalarType()
-                                 ? V.isSignedMinBitwidthRootNode()
-                                 : true,
-                             RdxRootInst->getType());
-          ReducedSubTree =
-              Builder.CreateInsertElement(ReducedSubTree, Val, I);
-        }
-        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
-      } else {
-        Type *VecTy = VectorizedRoot->getType();
-        Type *RedScalarTy = VecTy->getScalarType();
-        VectorValuesAndScales.emplace_back(
-            VectorizedRoot,
-            OptReusedScalars && SameScaleFactor
-                ? SameValuesCounter.front().second
-                : 1,
-            RedScalarTy != ScalarTy->getScalarType()
-                ? V.isSignedMinBitwidthRootNode()
-                : true);
-      }
+      Type *VecTy = VectorizedRoot->getType();
+      Type *RedScalarTy = VecTy->getScalarType();
+      VectorValuesAndScales.emplace_back(
+          VectorizedRoot,
+          OptReusedScalars && SameScaleFactor
+              ? SameValuesCounter.front().second
+              : 1,
+          RedScalarTy != ScalarTy->getScalarType()
+              ? V.isSignedMinBitwidthRootNode()
+              : true);
 
       // Count vectorized reduced values to exclude them from final reduction.
       for (Value *RdxVal : VL) {
@@ -22363,9 +22326,35 @@ class HorizontalReduction {
   Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                         Value *Vec, unsigned Scale, bool IsSigned,
                         Type *DestTy) {
-    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
-    if (Rdx->getType() != DestTy->getScalarType())
-      Rdx = Builder.CreateIntCast(Rdx, DestTy->getScalarType(), IsSigned);
+    Value *Rdx;
+    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
+      unsigned DestTyNumElements = getNumElements(VecTy);
+      unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
+      Rdx = PoisonValue::get(
+          getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
+      for (unsigned I : seq<unsigned>(DestTyNumElements)) {
+        // Do reduction for each lane.
+        // e.g., do reduce add for
+        // VL[0] = <4 x Ty> <a, b, c, d>
+        // VL[1] = <4 x Ty> <e, f, g, h>
+        // Lane[0] = <2 x Ty> <a, e>
+        // Lane[1] = <2 x Ty> <b, f>
+        // Lane[2] = <2 x Ty> <c, g>
+        // Lane[3] = <2 x Ty> <d, h>
+        // result[0] = reduce add Lane[0]
+        // result[1] = reduce add Lane[1]
+        // result[2] = reduce add Lane[2]
+        // result[3] = reduce add Lane[3]
+        SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
+        Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
+        Rdx = Builder.CreateInsertElement(
+            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
+      }
+    } else {
+      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+    }
+    if (Rdx->getType() != DestTy)
+      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
     // Improved analysis for add/fadd/xor reductions with same scale
     // factor for all operands of reductions. We can emit scalar ops for
     // them instead.
@@ -22432,30 +22421,32 @@ class HorizontalReduction {
       case RecurKind::FMul: {
         unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
         if (!AllConsts) {
-          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-            assert(SLPReVec && "FixedVectorType is not expected.");
-            unsigned ScalarTyNumElements = VecTy->getNumElements();
-            for (unsigned I : seq<unsigned>(ReducedVals.size())) {
-              VectorCost += TTI->getShuffleCost(
-                  TTI::SK_PermuteSingleSrc, VectorTy,
-                  createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
-              VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
-                                                            CostKind);
-            }
-            VectorCost += TTI->getScalarizationOverhead(
-                VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
-                /*Extract*/ false, TTI::TCK_RecipThroughput);
-          } else if (DoesRequireReductionOp) {
-            Type *RedTy = VectorTy->getElementType();
-            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
-                std::make_pair(RedTy, true));
-            if (RType == RedTy) {
-              VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                           FMF, CostKind);
+          if (DoesRequireReductionOp) {
+            if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+              assert(SLPReVec && "FixedVectorType is not expected.");
+              unsigned ScalarTyNumElements = VecTy->getNumElements();
+              for (unsigned I : seq<unsigned>(ReducedVals.size())) {
+                VectorCost += TTI->getShuffleCost(
+                    TTI::SK_PermuteSingleSrc, VectorTy,
+                    createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
+                VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
+                                                              FMF, CostKind);
+              }
+              VectorCost += TTI->getScalarizationOverhead(
+                  VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
+                  /*Extract*/ false, TTI::TCK_RecipThroughput);
             } else {
-              VectorCost = TTI->getExtendedReductionCost(
-                  RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
-                  FMF, CostKind);
+              Type *RedTy = VectorTy->getElementType();
+              auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
+                  std::make_pair(RedTy, true));
+              if (RType == RedTy) {
+                VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+                                                             FMF, CostKind);
+              } else {
+                VectorCost = TTI->getExtendedReductionCost(
+                    RdxOpcode, !IsSigned, RedTy,
+                    getWidenedType(RType, ReduxWidth), FMF, CostKind);
+              }
             }
           } else {
             Type *RedTy = VectorTy->getElementType();
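
Read as a formula (my paraphrase of the REVEC branch above, not text from the commit): for a FixedVectorType ScalarTy with N elements and VF = ReducedVals.size() reduced operands,

    VectorCost = sum over I in [0, VF) of
                     ShuffleCost(SK_PermuteSingleSrc, VectorTy, strideMask(I, N, VF))
                   + ArithmeticReductionCost(RdxOpcode, <N x elt>)
                 + ScalarizationOverhead(insert all N lanes of the result)

Note the REVEC costing is now also nested under DoesRequireReductionOp, so it is only charged when a reduction instruction will actually be emitted.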

llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll

Lines changed: 4 additions & 4 deletions
@@ -44,16 +44,16 @@ define void @e(<4 x i16> %0) {
 ; THRESH-NEXT:    [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP12]], [[TMP7]]
 ; THRESH-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; THRESH-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i64 0
 ; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; THRESH-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP17]], i64 1
 ; THRESH-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; THRESH-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP19]], i64 2
 ; THRESH-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; THRESH-NEXT:    [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]])
-; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i32 0
-; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP17]], i32 1
-; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP19]], i32 2
-; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP21]], i32 3
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP21]], i64 3
 ; THRESH-NEXT:    [[TMP26]] = zext <4 x i1> [[TMP25]] to <4 x i32>
 ; THRESH-NEXT:    br label [[VECTOR_BODY]]
 ;
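
Two knock-on effects of routing codegen through createSingleOp are visible in this hunk: each insertelement is now emitted immediately after its per-lane reduction rather than batched after all four reductions, and the lane indices are i64 rather than i32 constants (presumably because IRBuilder's integer-index CreateInsertElement overload takes a uint64_t; this is my inference, not stated in the commit).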
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 < %s | FileCheck %s
+
+define <4 x i16> @test() {
+; CHECK-LABEL: define <4 x i16> @test() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP7]], <4 x i16> zeroinitializer, i64 12)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]])
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]])
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]])
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <16 x i16> [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]]
+; CHECK-NEXT:    ret <4 x i16> [[OP_RDX11]]
+;
+entry:
+  %subi = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub40.i = add <4 x i16> %subi, zeroinitializer
+  %sub41.i = add <4 x i16> %subi, zeroinitializer
+  %sub42.i = add <4 x i16> %subi, zeroinitializer
+  %sub43.i = add <4 x i16> %subi, zeroinitializer
+  %sub44.i = add <4 x i16> %subi, zeroinitializer
+  %sub45.i = add <4 x i16> %subi, zeroinitializer
+  %sub46.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub47.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub48.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub49.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %or40.i = or <4 x i16> %sub40.i, %sub41.i
+  %or41.i = or <4 x i16> %or40.i, %sub42.i
+  %or42.i = or <4 x i16> %or41.i, %sub43.i
+  %or43.i = or <4 x i16> %or42.i, %sub44.i
+  %or44.i = or <4 x i16> %or43.i, %sub45.i
+  %or45.i = or <4 x i16> %or44.i, %sub46.i
+  %or46.i = or <4 x i16> %or45.i, %sub47.i
+  %or47.i = or <4 x i16> %or46.i, %sub48.i
+  %or48.i = or <4 x i16> %or47.i, %sub49.i
+  %or50.i = or <4 x i16> %or48.i, %subi
+  %subii = add <4 x i16> zeroinitializer, zeroinitializer
+  %subi16.i = add <4 x i16> %subii, zeroinitializer
+  %subi17.i = add <4 x i16> %subii, zeroinitializer
+  %0 = or <4 x i16> %subi16.i, %subi17.i
+  %1 = or <4 x i16> %0, %or50.i
+  ret <4 x i16> %1
+}
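
The CHECK lines above were generated by utils/update_test_checks.py, as the NOTE line records, and can be regenerated the same way if the output changes. The RUN line forces vectorization regardless of profitability (-slp-threshold=-100) and allows wide registers (-slp-max-reg-size=1024) so the <16 x i16> intermediate seen in the checks can form; -slp-revec enables the REVEC path under test.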
