Skip to content

Commit 076318b

Browse files
committed
[SLP]Use proper order when calculating costs for geps/extracts to correctly identify profitability
Need to properly reorder the scalars when evaluating the costs for the external uses/GEPs, to prevent discrepancies in the calculation of the profitability costs used to choose between gather and compressed loads. Fixes #132099 (comment)
1 parent fb73086 commit 076318b

File tree

2 files changed

+43
-8
lines changed

2 files changed

+43
-8
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5491,12 +5491,16 @@ static bool isMaskedLoadCompress(
54915491
const unsigned Sz = VL.size();
54925492
auto *VecTy = getWidenedType(ScalarTy, Sz);
54935493
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5494+
SmallVector<int> Mask;
5495+
if (!Order.empty())
5496+
inversePermutation(Order, Mask);
54945497
// Check external uses.
54955498
for (const auto [I, V] : enumerate(VL)) {
54965499
if (AreAllUsersVectorized(V))
54975500
continue;
54985501
InstructionCost ExtractCost =
5499-
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
5502+
TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
5503+
Mask.empty() ? I : Mask[I]);
55005504
InstructionCost ScalarCost =
55015505
TTI.getInstructionCost(cast<Instruction>(V), CostKind);
55025506
if (ExtractCost <= ScalarCost)
@@ -5536,8 +5540,11 @@ static bool isMaskedLoadCompress(
55365540
bool IsStrided =
55375541
buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
55385542
assert(CompressMask.size() >= 2 && "At least two elements are required");
5543+
SmallVector<Value *> OrderedPointerOps(PointerOps);
5544+
if (!Order.empty())
5545+
reorderScalars(OrderedPointerOps, Mask);
55395546
auto [ScalarGEPCost, VectorGEPCost] =
5540-
getGEPCosts(TTI, PointerOps, PointerOps.front(),
5547+
getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
55415548
Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
55425549
// The cost of scalar loads.
55435550
InstructionCost ScalarLoadsCost =
@@ -5564,17 +5571,16 @@ static bool isMaskedLoadCompress(
55645571
TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
55655572
LI->getPointerAddressSpace(), CostKind);
55665573
}
5567-
SmallVector<int> Mask;
5568-
if (!Order.empty())
5569-
inversePermutation(Order, Mask);
55705574
if (IsStrided) {
55715575
// Check for potential segmented(interleaved) loads.
55725576
if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
55735577
CommonAlignment,
55745578
LI->getPointerAddressSpace())) {
5575-
InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
5576-
Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
5577-
CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
5579+
InstructionCost InterleavedCost =
5580+
VectorGEPCost + TTI.getInterleavedMemoryOpCost(
5581+
Instruction::Load, LoadVecTy, CompressMask[1],
5582+
std::nullopt, CommonAlignment,
5583+
LI->getPointerAddressSpace(), CostKind, IsMasked);
55785584
if (!Mask.empty())
55795585
InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
55805586
VecTy, Mask, CostKind);
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s -mcpu=neoverse-512tvb | FileCheck %s
3+
4+
define i32 @test(ptr %0, i64 %1) vscale_range(2,2) {
5+
; CHECK-LABEL: define i32 @test(
6+
; CHECK-SAME: ptr [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[ENTRY:.*:]]
8+
; CHECK-NEXT: br label %[[FOR_BODY48:.*]]
9+
; CHECK: [[FOR_BODY48]]:
10+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP0]], i64 [[TMP1]]
11+
; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8
12+
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX52]], align 4
13+
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP2]], align 4
14+
; CHECK-NEXT: [[ADD56:%.*]] = fadd float [[TMP3]], [[TMP4]]
15+
; CHECK-NEXT: store float [[ADD56]], ptr [[TMP0]], align 4
16+
; CHECK-NEXT: br label %[[FOR_BODY48]]
17+
;
18+
; Regression test for the Order-aware cost calculation in isMaskedLoadCompress:
; the two float loads are 8 bytes apart (a gap of one float element at +4), and
; the fadd consumes the higher-address load (%3) before the lower-address one
; (%4), i.e. the scalars appear in non-identity order. The autogenerated CHECK
; lines above expect both loads to remain SCALAR — with the reordered cost
; evaluation, SLP must conclude that vectorizing (via gather/compressed load)
; is not profitable here.
entry:
19+
br label %for.body48
20+
21+
for.body48:
22+
%2 = getelementptr float, ptr %0, i64 %1          ; common base pointer
23+
%arrayidx52 = getelementptr i8, ptr %2, i64 8     ; base + 8 bytes = 2 float elements past %2
24+
%3 = load float, ptr %arrayidx52, align 4
25+
%4 = load float, ptr %2, align 4
26+
%add56 = fadd float %3, %4                        ; uses loads in reversed address order
27+
store float %add56, ptr %0, align 4
28+
br label %for.body48
29+
}

0 commit comments

Comments
 (0)