
Commit 85d15bd

[TTI][X86] getMemoryOpCost - reduced costs when loading uniform values due to value reuse (#118642)
Similar to what we do for broadcast shuffles: when legalising load costs, if the value is known to be uniform, we only load a single vector and reuse it across the split legalised registers. Fixes #111126
1 parent 1b95e76 commit 85d15bd
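
For context, the effect of the change can be illustrated with a small standalone sketch (simplified; the function name estimateLoadCost and the cost constants are illustrative, not LLVM's actual implementation). The legalisation loop charges each split register access, but for a uniform load it now returns after the first (widest) register has been charged, since that single load can be reused by the remaining splits.

#include <cstdio>

// Simplified stand-in for the legalized memory-op costing loop in
// X86TTIImpl::getMemoryOpCost. Names and constants are illustrative only.
static int estimateLoadCost(int totalBytes, int regBytes, bool slowUnaligned32,
                            bool isUniformLoad) {
  int cost = 0;
  for (int done = 0; done < totalBytes; done += regBytes) {
    // Charge the current legalized register access.
    if (regBytes == 32 && slowUnaligned32)
      cost += 2; // proxy for a double-pumped 32-byte AVX access
    else if (regBytes < 4)
      cost += 2; // sub-32-bit accesses need PINSR*/PEXTR* or scalarization
    else
      cost += 1;

    // New behaviour: a uniform value needs only one (widest) vector load,
    // which the remaining splits can reuse, so stop accumulating cost here.
    if (isUniformLoad)
      return cost;
  }
  return cost;
}

int main() {
  // e.g. a 64-byte load legalized into two 32-byte AVX registers:
  std::printf("non-uniform: %d\n", estimateLoadCost(64, 32, false, false)); // 2
  std::printf("uniform:     %d\n", estimateLoadCost(64, 32, false, true));  // 1
  return 0;
}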

File tree

2 files changed: +28 −46 lines

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 17 additions & 12 deletions
@@ -5237,6 +5237,23 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
           CurrOpSizeBytes != 1)
         break; // Try smalled vector size.
 
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
+      // will be scalarized.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else if (CurrOpSizeBytes < 4)
+        Cost += 2;
+      else
+        Cost += 1;
+
+      // If we're loading a uniform value, then we don't need to split the load,
+      // loading just a single (widest) vector can be reused by all splits.
+      if (IsLoad && OpInfo.isUniform())
+        return Cost;
+
       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
 
       // If we have fully processed the previous reg, we need to replenish it.
@@ -5265,18 +5282,6 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            !IsLoad, CostKind);
       }
 
-      // This isn't exactly right. We're using slow unaligned 32-byte accesses
-      // as a proxy for a double-pumped AVX memory interface such as on
-      // Sandybridge.
-      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
-      // will be scalarized.
-      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
-        Cost += 2;
-      else if (CurrOpSizeBytes < 4)
-        Cost += 2;
-      else
-        Cost += 1;
-
       SubVecEltsLeft -= CurrNumEltPerOp;
       NumEltRemaining -= CurrNumEltPerOp;
       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);

llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll

Lines changed: 11 additions & 34 deletions
@@ -1,42 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s
 
 @arr = global [20 x i64] zeroinitializer, align 16
 
 define void @PR111126() {
-; SSE-LABEL: @PR111126(
-; SSE-NEXT:    store i64 1, ptr @arr, align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 16), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 24), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 40), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 48), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 56), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 72), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 80), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 88), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 104), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 112), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 120), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 136), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 144), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 152), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @PR111126(
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @PR111126(
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
+; CHECK-NEXT:    ret void
 ;
   store i64 1, ptr @arr, align 16
   store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
