Skip to content

Commit 1e1c8d1

Browse files
[SLP] Add external uses cost for the gathered loads.
If a load is part of a gather node and is also part of a vectorized subvector, the cost model needs to add an estimate for its non-vectorized external uses. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #99889
1 parent 6808e6c commit 1e1c8d1

File tree

3 files changed

+32
-28
lines changed

3 files changed

+32
-28
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8363,6 +8363,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
83638363
: TTI.getStridedMemoryOpCost(
83648364
Instruction::Load, LoadTy, LI->getPointerOperand(),
83658365
/*VariableMask=*/false, Alignment, CostKind, LI);
8366+
// Add external uses costs.
8367+
for (auto [Idx, V] : enumerate(VL.slice(
8368+
P.first, std::min<unsigned>(VL.size() - P.first, VF))))
8369+
if (!R.areAllUsersVectorized(cast<Instruction>(V)))
8370+
GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
8371+
LoadTy, CostKind, Idx);
83668372
// Estimate GEP cost.
83678373
SmallVector<Value *> PointerOps(VF);
83688374
for (auto [I, V] : enumerate(VL.slice(P.first, VF)))

llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,19 @@ define i16 @test() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
88
; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
9-
; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
109
; CHECK-NEXT: br label [[WHILE:%.*]]
1110
; CHECK: while:
12-
; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
11+
; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX26:%.*]], [[WHILE]] ]
1312
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
1413
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
15-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
16-
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
17-
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
18-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
19-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
20-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
21-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
22-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
23-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
24-
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
25-
; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
14+
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
15+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
16+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
17+
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
18+
; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 0, [[TMP2]]
19+
; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP1]]
20+
; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX24]]
21+
; CHECK-NEXT: [[OP_RDX26]] = xor i64 [[OP_RDX25]], [[TMP5]]
2622
; CHECK-NEXT: br label [[WHILE]]
2723
;
2824
entry:

llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) {
77
; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
88
; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4
99
; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
10+
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
1011
; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
1112
; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4
12-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
13+
; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
14+
; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
1315
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
1416
; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4
1517
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -19,11 +21,10 @@ define void @test(ptr nocapture %t2) {
1921
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
2022
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
2123
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
22-
; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
23-
; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
2424
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
2525
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
2626
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
27+
; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
2728
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
2829
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
2930
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -33,19 +34,20 @@ define void @test(ptr nocapture %t2) {
3334
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
3435
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
3536
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
36-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
37-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
38-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
39-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
40-
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
41-
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
42-
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
43-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
44-
; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
45-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
46-
; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
37+
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
38+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
39+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
40+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
41+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
42+
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
43+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
44+
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
45+
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
46+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
47+
; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
4748
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
48-
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
49+
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
50+
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4951
; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4
5052
; CHECK-NEXT: ret void
5153
;

0 commit comments

Comments
 (0)