Skip to content

Commit fb86b3d

Browse files
committed
[SLP]Change the insertion point for outside-block-used nodes and prevec phi operand gathers
The insertion point for a (non-schedulable) vector node needs to be set after the last instruction in the node to avoid def-use breakage. However, this also causes a miscompilation with gather/buildvector operands of phi nodes that are used only by that same phi in the block. Such nodes are supposed to be inserted at the end of the block, and after changing the insertion point for the non-schedulable vector node, this may also break def-use dependencies. These nodes need to be pre-vectorized, i.e. emitted as early as possible, so that the vectorized nodes are inserted before these nodes. Fixes #139728 Recommit after revert 60fb921 Reviewers: hiraditya, HanKuanChen, RKSimon Reviewed By: RKSimon Pull Request: #139917
1 parent 1b41599 commit fb86b3d

File tree

7 files changed

+88
-26
lines changed

7 files changed

+88
-26
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16142,16 +16142,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
1614216142
[](Value *V) {
1614316143
return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
1614416144
})) ||
16145-
all_of(E->Scalars,
16146-
[](Value *V) {
16147-
return isa<PoisonValue>(V) ||
16148-
(!isVectorLikeInstWithConstOps(V) &&
16149-
isUsedOutsideBlock(V));
16150-
}) ||
16151-
(E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
16152-
return isa<ExtractElementInst, UndefValue>(V) ||
16153-
areAllOperandsNonInsts(V);
16154-
})))
16145+
all_of(E->Scalars, [](Value *V) {
16146+
return isa<PoisonValue>(V) ||
16147+
(!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
16148+
}))
1615516149
Res = FindLastInst();
1615616150
else
1615716151
Res = FindFirstInst();
@@ -17617,6 +17611,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1761717611
if (VecTy)
1761817612
ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
1761917613
}
17614+
if (E->VectorizedValue)
17615+
return E->VectorizedValue;
1762017616
auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
1762117617
if (E->isGather()) {
1762217618
// Set insert point for non-reduction initial nodes.
@@ -17799,6 +17795,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1779917795
Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
1780017796
NewPhi->addIncoming(VecOp, IBB);
1780117797
TreeEntry *OpTE = getOperandEntry(E, I);
17798+
assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
1780217799
OpTE->VectorizedValue = VecOp;
1780317800
continue;
1780417801
}
@@ -18696,6 +18693,22 @@ Value *BoUpSLP::vectorizeTree(
1869618693
else
1869718694
Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
1869818695

18696+
// Vectorize gather operands of the nodes with the external uses only.
18697+
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18698+
if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
18699+
TE->UserTreeIndex.UserTE->hasState() &&
18700+
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
18701+
(TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
18702+
TE->UserTreeIndex.UserTE->isAltShuffle()) &&
18703+
all_of(TE->UserTreeIndex.UserTE->Scalars,
18704+
[](Value *V) { return isUsedOutsideBlock(V); })) {
18705+
Instruction &LastInst =
18706+
getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
18707+
Builder.SetInsertPoint(&LastInst);
18708+
Builder.SetCurrentDebugLocation(LastInst.getDebugLoc());
18709+
(void)vectorizeTree(TE.get());
18710+
}
18711+
}
1869918712
// Emit gathered loads first to emit better code for the users of those
1870018713
// gathered loads.
1870118714
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {

llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,37 +10,38 @@ target triple = "x86_64-apple-macosx10.8.0"
1010
define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices, i1 %arg) #0 align 2 {
1111
; CHECK-LABEL: @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(
1212
; CHECK-NEXT: entry:
13-
; CHECK-NEXT: br i1 %arg, label [[RETURN:%.*]], label [[IF_END:%.*]]
13+
; CHECK-NEXT: br i1 [[ARG:%.*]], label [[RETURN:%.*]], label [[IF_END:%.*]]
1414
; CHECK: if.end:
1515
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1616
; CHECK: for.body:
17-
; CHECK-NEXT: br i1 %arg, label [[IF_THEN17_1:%.*]], label [[IF_END22_1:%.*]]
17+
; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN17_1:%.*]], label [[IF_END22_1:%.*]]
1818
; CHECK: for.end36:
1919
; CHECK-NEXT: br label [[FOR_BODY144:%.*]]
2020
; CHECK: for.body144:
21-
; CHECK-NEXT: br i1 %arg, label [[FOR_END227:%.*]], label [[FOR_BODY144]]
21+
; CHECK-NEXT: br i1 [[ARG]], label [[FOR_END227:%.*]], label [[FOR_BODY144]]
2222
; CHECK: for.end227:
23-
; CHECK-NEXT: br i1 %arg, label [[FOR_END271:%.*]], label [[FOR_BODY233:%.*]]
23+
; CHECK-NEXT: br i1 [[ARG]], label [[FOR_END271:%.*]], label [[FOR_BODY233:%.*]]
2424
; CHECK: for.body233:
25-
; CHECK-NEXT: br i1 %arg, label [[FOR_BODY233]], label [[FOR_END271]]
25+
; CHECK-NEXT: br i1 [[ARG]], label [[FOR_BODY233]], label [[FOR_END271]]
2626
; CHECK: for.end271:
2727
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ splat (float 0x47EFFFFFE0000000), [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ]
2828
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> undef, [[TMP0]]
29-
; CHECK-NEXT: br i1 %arg, label [[IF_THEN291:%.*]], label [[RETURN]]
29+
; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN291:%.*]], label [[RETURN]]
3030
; CHECK: if.then291:
3131
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], splat (float 5.000000e-01)
3232
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP0]], [[TMP2]]
33-
; CHECK-NEXT: br i1 %arg, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]]
33+
; CHECK-NEXT: br i1 [[ARG]], label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]]
3434
; CHECK: if.else319:
35-
; CHECK-NEXT: br i1 %arg, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]]
35+
; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN325:%.*]], label [[IF_END327:%.*]]
3636
; CHECK: if.then325:
3737
; CHECK-NEXT: br label [[IF_END327]]
3838
; CHECK: if.end327:
39-
; CHECK-NEXT: br i1 %arg, label [[IF_THEN329:%.*]], label [[IF_END332]]
39+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> <float poison, float undef>, <2 x i32> <i32 0, i32 3>
40+
; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN329:%.*]], label [[IF_END332]]
4041
; CHECK: if.then329:
4142
; CHECK-NEXT: br label [[IF_END332]]
4243
; CHECK: if.end332:
43-
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP1]], [[IF_THEN329]] ], [ [[TMP1]], [[IF_END327]] ], [ splat (float 0x3F847AE140000000), [[IF_THEN291]] ]
44+
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP6]], [[IF_THEN329]] ], [ [[TMP6]], [[IF_END327]] ], [ splat (float 0x3F847AE140000000), [[IF_THEN291]] ]
4445
; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x float> [[TMP3]], [[TMP4]]
4546
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[VERTICES:%.*]], align 4
4647
; CHECK-NEXT: br label [[RETURN]]
@@ -49,11 +50,11 @@ define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %ve
4950
; CHECK: if.then17.1:
5051
; CHECK-NEXT: br label [[IF_END22_1]]
5152
; CHECK: if.end22.1:
52-
; CHECK-NEXT: br i1 %arg, label [[IF_THEN17_2:%.*]], label [[IF_END22_2:%.*]]
53+
; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN17_2:%.*]], label [[IF_END22_2:%.*]]
5354
; CHECK: if.then17.2:
5455
; CHECK-NEXT: br label [[IF_END22_2]]
5556
; CHECK: if.end22.2:
56-
; CHECK-NEXT: br i1 %arg, label [[FOR_END36:%.*]], label [[FOR_BODY]]
57+
; CHECK-NEXT: br i1 [[ARG]], label [[FOR_END36:%.*]], label [[FOR_BODY]]
5758
;
5859
entry:
5960
br i1 %arg, label %return, label %if.end

llvm/test/Transforms/SLPVectorizer/X86/matched-bv-schedulable.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ define void @test() {
77
; CHECK-NEXT: br i1 false, label %[[BB1:.*]], label %[[BB5:.*]]
88
; CHECK: [[BB1]]:
99
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], %[[BB1]] ], [ zeroinitializer, %[[BB]] ]
10+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
11+
; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP0]], [[TMP4]]
1012
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
1113
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 0, i32 0
1214
; CHECK-NEXT: [[TMP3]] = or <2 x i32> [[TMP1]], [[TMP2]]
13-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
14-
; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP0]], [[TMP4]]
1515
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
1616
; CHECK-NEXT: [[OR3:%.*]] = or i32 [[TMP6]], 0
1717
; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB5]]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s
3+
4+
define i64 @test() {
5+
; CHECK-LABEL: define i64 @test() {
6+
; CHECK-NEXT: [[BB:.*]]:
7+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 0, i32 1
8+
; CHECK-NEXT: br label %[[BB1:.*]]
9+
; CHECK: [[BB1]]:
10+
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ]
11+
; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]]
12+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
13+
; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer
14+
; CHECK-NEXT: br label %[[BB5]]
15+
; CHECK: [[BB5]]:
16+
; CHECK-NEXT: br i1 false, label %[[BB6:.*]], label %[[BB1]]
17+
; CHECK: [[BB6]]:
18+
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP2]], %[[BB5]] ]
19+
; CHECK-NEXT: ret i64 0
20+
;
21+
bb:
22+
br label %bb1
23+
24+
bb1:
25+
%phi = phi i32 [ 0, %bb ], [ %or, %bb5 ]
26+
%phi2 = phi i32 [ 0, %bb ], [ %or4, %bb5 ]
27+
%or = or i32 %phi, 0
28+
%add = add i32 0, 0
29+
%or3 = or i32 %add, %phi2
30+
%or4 = or i32 %or3, 0
31+
br label %bb5
32+
33+
bb5:
34+
br i1 false, label %bb6, label %bb1
35+
36+
bb6:
37+
%phi7 = phi i32 [ %or, %bb5 ]
38+
%phi8 = phi i32 [ %or3, %bb5 ]
39+
ret i64 0
40+
}

llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ define void @test(float %0) {
55
; CHECK-LABEL: define void @test(
66
; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
77
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
8-
; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer
98
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
9+
; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer
1010
; CHECK-NEXT: [[TMP3:%.*]] = fdiv <2 x float> [[TMP6]], zeroinitializer
1111
; CHECK-NEXT: br label %[[BB6:.*]]
1212
; CHECK: [[BB6]]:

llvm/test/Transforms/SLPVectorizer/X86/user-node-with-same-last-instr.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ define void @wombat(i32 %arg) {
1111
; CHECK: [[BB2]]:
1212
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP4:%.*]], %[[BB4:.*]] ], [ zeroinitializer, %[[BB1]] ]
1313
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ARG]], i32 0
14+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
1415
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
1516
; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]]
1617
; CHECK-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
17-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
1818
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i32> [[TMP0]], [[TMP5]]
1919
; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP0]], [[TMP5]]
2020
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>

llvm/test/Transforms/SLPVectorizer/revec.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,10 @@ define void @test13(<8 x i32> %0, ptr %out0, ptr %out1, ptr %out2) {
415415
; CHECK-NEXT: entry:
416416
; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> poison, <8 x i32> [[TMP0:%.*]], i64 0)
417417
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
418+
; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> poison, <8 x i32> zeroinitializer, i64 0)
419+
; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP9]], <8 x i32> zeroinitializer, i64 8)
420+
; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP10]], <8 x i32> zeroinitializer, i64 16)
421+
; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP5]], <8 x i32> zeroinitializer, i64 24)
418422
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
419423
; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
420424
; CHECK: for.end.loopexit:
@@ -454,6 +458,10 @@ define void @test14(<8 x i1> %0) {
454458
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
455459
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i16>
456460
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
461+
; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> poison, <8 x i16> zeroinitializer, i64 0)
462+
; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> [[TMP9]], <8 x i16> zeroinitializer, i64 8)
463+
; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> [[TMP10]], <8 x i16> zeroinitializer, i64 16)
464+
; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> [[TMP7]], <8 x i16> zeroinitializer, i64 24)
457465
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
458466
; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
459467
; CHECK: for.end.loopexit:

0 commit comments

Comments
 (0)