Skip to content

Commit f5ee07a

Browse files
[SLP]Improve instruction reordering mode detection.
The "instruction" reordering mode should be selected only if the other operands contain compatible instructions that can be reordered. Otherwise, it is better to select the splat reordering mode.
Metric: size..text
Program: test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test
size..text: results = 12383340.00, results0 = 12383324.00, diff = -0.0%
Some 4x operations get replaced by 8x.
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: #97485
1 parent ae7ab04 commit f5ee07a

File tree

3 files changed

+34
-22
lines changed

3 files changed

+34
-22
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2213,6 +2213,27 @@ class BoUpSLP {
22132213
return getNumLanes() == 2 || Cnt > 1;
22142214
}
22152215

2216+
/// Checks if there is at least one compatible operand in lanes other
2217+
/// than \p Lane, compatible with the operand \p Op.
2218+
bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2219+
bool OpAPO = getData(OpIdx, Lane).APO;
2220+
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2221+
if (Ln == Lane)
2222+
continue;
2223+
if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2224+
const OperandData &Data = getData(OpI, Ln);
2225+
if (Data.APO != OpAPO || Data.IsUsed)
2226+
return true;
2227+
Value *OpILn = getValue(OpI, Ln);
2228+
return (L && L->isLoopInvariant(OpILn)) ||
2229+
(getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2230+
Op->getParent() == cast<Instruction>(OpILn)->getParent());
2231+
}))
2232+
return true;
2233+
}
2234+
return false;
2235+
}
2236+
22162237
public:
22172238
/// Initialize with all the operands of the instruction vector \p RootVL.
22182239
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
@@ -2268,14 +2289,14 @@ class BoUpSLP {
22682289
// side.
22692290
if (isa<LoadInst>(OpLane0))
22702291
ReorderingModes[OpIdx] = ReorderingMode::Load;
2271-
else if (isa<Instruction>(OpLane0)) {
2292+
else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
22722293
// Check if OpLane0 should be broadcast.
2273-
if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2294+
if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2295+
!canBeVectorized(OpILane0, OpIdx, FirstLane))
22742296
ReorderingModes[OpIdx] = ReorderingMode::Splat;
22752297
else
22762298
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2277-
}
2278-
else if (isa<Constant>(OpLane0))
2299+
} else if (isa<Constant>(OpLane0))
22792300
ReorderingModes[OpIdx] = ReorderingMode::Constant;
22802301
else if (isa<Argument>(OpLane0))
22812302
// Our best hope is a Splat. It may save some cost in some cases.

llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ define void @foo() local_unnamed_addr {
1212
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 0), align 4
1313
; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0
1414
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4
15-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
16-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1
17-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
18-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
19-
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]]
15+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 undef, i32 poison, i32 poison>, i32 [[TMP0]], i32 0
16+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
17+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
18+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 undef, i32 poison, i32 undef, i32 undef>, i32 [[ADD277]], i32 1
19+
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]]
2020
; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], <i32 6, i32 6, i32 6, i32 6>
2121
; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4
2222
; CHECK-NEXT: unreachable

llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,27 +12,18 @@ define void @test() {
1212
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
1313
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
1414
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP1]]
15-
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4
1615
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
17-
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
18-
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
19-
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX11]], align 4
2016
; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
2117
; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP5]]
22-
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
23-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
24-
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP2]], i32 1
25-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
26-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
27-
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP6]], i32 3
28-
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
29-
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
30-
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
18+
; CHECK-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
19+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
20+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP0]], i32 0
3121
; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP11]], [[TMP14]]
3222
; CHECK-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX6]], align 4
3323
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
3424
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
3525
; CHECK-NEXT: [[TMP16]] = load float, ptr [[ARRAYIDX41]], align 4
26+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP14]], i32 3
3627
; CHECK-NEXT: [[MUL45:%.*]] = fmul fast float [[TMP16]], [[TMP6]]
3728
; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
3829
; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 31990

0 commit comments

Comments
 (0)