Skip to content

Commit 39bab1d

Browse files
committed
[SLP] Check if the operand for removal is a reduction operand awaiting reduction
If an operand of the instruction to be removed is a reduction value that has not been reduced yet, it has no users and may therefore be mistakenly removed during operand analysis. Check for such operands and do not mark them as dead. Fixes #128736
1 parent 0212834 commit 39bab1d

File tree

2 files changed

+97
-13
lines changed

2 files changed

+97
-13
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1389,9 +1389,10 @@ class BoUpSLP {
13891389
/// Vectorize the tree but with the list of externally used values \p
13901390
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
13911391
/// generated extractvalue instructions.
1392-
Value *
1393-
vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1394-
Instruction *ReductionRoot = nullptr);
1392+
Value *vectorizeTree(
1393+
const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1394+
Instruction *ReductionRoot = nullptr,
1395+
ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
13951396

13961397
/// \returns the cost incurred by unwanted spills and fills, caused by
13971398
/// holding live values over call sites.
@@ -2849,11 +2850,13 @@ class BoUpSLP {
28492850
/// Remove instructions from the parent function and clear the operands of \p
28502851
/// DeadVals instructions, marking for deletion trivially dead operands.
28512852
template <typename T>
2852-
void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2853+
void removeInstructionsAndOperands(
2854+
ArrayRef<T *> DeadVals,
2855+
ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
28532856
SmallVector<WeakTrackingVH> DeadInsts;
28542857
for (T *V : DeadVals) {
28552858
auto *I = cast<Instruction>(V);
2856-
DeletedInstructions.insert(I);
2859+
eraseInstruction(I);
28572860
}
28582861
DenseSet<Value *> Processed;
28592862
for (T *V : DeadVals) {
@@ -2915,12 +2918,17 @@ class BoUpSLP {
29152918
// loop iteration.
29162919
if (auto *OpI = dyn_cast<Instruction>(OpV))
29172920
if (!DeletedInstructions.contains(OpI) &&
2921+
(!OpI->getType()->isVectorTy() ||
2922+
none_of(VectorValuesAndScales,
2923+
[&](const std::tuple<Value *, unsigned, bool> &V) {
2924+
return std::get<0>(V) == OpI;
2925+
})) &&
29182926
isInstructionTriviallyDead(OpI, TLI))
29192927
DeadInsts.push_back(OpI);
29202928
}
29212929

29222930
VI->removeFromParent();
2923-
DeletedInstructions.insert(VI);
2931+
eraseInstruction(VI);
29242932
SE->forgetValue(VI);
29252933
}
29262934
}
@@ -16466,9 +16474,10 @@ Value *BoUpSLP::vectorizeTree() {
1646616474
return vectorizeTree(ExternallyUsedValues);
1646716475
}
1646816476

16469-
Value *
16470-
BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16471-
Instruction *ReductionRoot) {
16477+
Value *BoUpSLP::vectorizeTree(
16478+
const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16479+
Instruction *ReductionRoot,
16480+
ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
1647216481
// All blocks must be scheduled before any instructions are inserted.
1647316482
for (auto &BSIter : BlocksSchedules) {
1647416483
scheduleBlock(BSIter.second.get());
@@ -17075,7 +17084,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1707517084
// cache correctness.
1707617085
// NOTE: removeInstructionsAndOperands only marks the instruction for deletion
1707717086
// - instructions are not deleted until later.
17078-
removeInstructionsAndOperands(ArrayRef(RemovedInsts));
17087+
removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
1707917088

1708017089
Builder.ClearInsertionPoint();
1708117090
InstrElementSize.clear();
@@ -20449,8 +20458,8 @@ class HorizontalReduction {
2044920458
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
2045020459

2045120460
// Vectorize a tree.
20452-
Value *VectorizedRoot =
20453-
V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20461+
Value *VectorizedRoot = V.vectorizeTree(
20462+
LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
2045420463
// Update TrackedToOrig mapping, since the tracked values might be
2045520464
// updated.
2045620465
for (Value *RdxVal : Candidates) {
@@ -20678,7 +20687,7 @@ class HorizontalReduction {
2067820687
Ignore->replaceAllUsesWith(P);
2067920688
}
2068020689
}
20681-
V.removeInstructionsAndOperands(RdxOps);
20690+
V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
2068220691
}
2068320692
} else if (!CheckForReusedReductionOps) {
2068420693
for (ReductionOpsType &RdxOps : ReductionOps)
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver2 < %s | FileCheck %s
3+
4+
define i32 @test(i32 %arg) {
5+
; CHECK-LABEL: define i32 @test(
6+
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[BB:.*]]:
8+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, i32 [[ARG]], i32 1
9+
; CHECK-NEXT: br label %[[BB1:.*]]
10+
; CHECK: [[BB1]]:
11+
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX:%.*]], %[[BB1]] ]
12+
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <2 x i64> zeroinitializer, i64 2)
13+
; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i64> zeroinitializer, [[TMP1]]
14+
; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32>
15+
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> zeroinitializer, [[TMP3]]
16+
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]]
17+
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
18+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
19+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
20+
; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> zeroinitializer, [[TMP8]]
21+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
22+
; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP7]], i64 0)
23+
; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP11]], [[TMP10]]
24+
; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[RDX_OP]], i64 0)
25+
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP12]])
26+
; CHECK-NEXT: [[OP_RDX]] = mul i32 0, [[TMP13]]
27+
; CHECK-NEXT: br label %[[BB1]]
28+
;
29+
bb:
30+
br label %bb1
31+
32+
bb1:
33+
%phi = phi i32 [ 0, %bb ], [ %mul37, %bb1 ]
34+
%mul = mul i64 0, 0
35+
%trunc = trunc i64 %mul to i32
36+
%or = or i32 0, %trunc
37+
%or2 = or i32 0, %or
38+
%or3 = or i32 %or2, 0
39+
%mul4 = mul i32 0, %or3
40+
%mul5 = mul i32 %or3, 0
41+
%mul6 = mul i32 %mul5, %mul4
42+
%mul7 = mul i32 %mul6, %mul4
43+
%mul8 = mul i32 %mul7, %or3
44+
%mul9 = mul i64 0, 0
45+
%trunc10 = trunc i64 %mul9 to i32
46+
%or11 = or i32 0, %trunc10
47+
%or12 = or i32 %arg, %or11
48+
%or13 = or i32 %or12, 0
49+
%mul14 = mul i32 %or13, %mul8
50+
%mul15 = mul i32 %mul14, 0
51+
%mul16 = mul i32 %mul15, 0
52+
%mul17 = mul i32 %mul16, %or13
53+
%shl = shl i64 0, 0
54+
%mul18 = mul i64 %shl, 0
55+
%trunc19 = trunc i64 %mul18 to i32
56+
%or20 = or i32 0, %trunc19
57+
%or21 = or i32 0, %or20
58+
%or22 = or i32 %or21, 0
59+
%mul23 = mul i32 %or22, %mul17
60+
%mul24 = mul i32 %mul23, 0
61+
%mul25 = mul i32 %mul24, 0
62+
%mul26 = mul i32 %mul25, %or22
63+
%shl27 = shl i64 0, 0
64+
%mul28 = mul i64 %shl27, 0
65+
%trunc29 = trunc i64 %mul28 to i32
66+
%or30 = or i32 0, %trunc29
67+
%or31 = or i32 0, %or30
68+
%or32 = or i32 %or31, 0
69+
%mul33 = mul i32 0, %or32
70+
%mul34 = mul i32 %or32, %mul26
71+
%mul35 = mul i32 %mul34, %mul33
72+
%mul36 = mul i32 %mul35, %mul33
73+
%mul37 = mul i32 %mul36, %or32
74+
br label %bb1
75+
}

0 commit comments

Comments
 (0)