[TTI][X86] getMemoryOpCost - reduced costs when loading uniform values due to value reuse #118642

Merged · 1 commit · Dec 4, 2024

29 changes: 17 additions & 12 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5237,6 +5237,23 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
CurrOpSizeBytes != 1)
break; // Try smaller vector size.

// This isn't exactly right. We're using slow unaligned 32-byte accesses
// as a proxy for a double-pumped AVX memory interface such as on
// Sandybridge.
// Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
// will be scalarized.
if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
Cost += 2;
else if (CurrOpSizeBytes < 4)
Cost += 2;
else
Cost += 1;

// If we're loading a uniform value, then we don't need to split the load,
// loading just a single (widest) vector can be reused by all splits.
if (IsLoad && OpInfo.isUniform())
return Cost;

bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

// If we have fully processed the previous reg, we need to replenish it.
@@ -5265,18 +5282,6 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
!IsLoad, CostKind);
}

// This isn't exactly right. We're using slow unaligned 32-byte accesses
// as a proxy for a double-pumped AVX memory interface such as on
// Sandybridge.
// Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
// will be scalarized.
if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
Cost += 2;
else if (CurrOpSizeBytes < 4)
Cost += 2;
else
Cost += 1;

SubVecEltsLeft -= CurrNumEltPerOp;
NumEltRemaining -= CurrNumEltPerOp;
Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
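
To illustrate the effect of the change, here is a minimal standalone sketch (hypothetical code, not LLVM code) of the splitting-cost arithmetic: a wide memory op legalized into several legal-width pieces used to cost one unit per piece, while a uniform (splat) load now only charges for the first, widest piece, whose result can be reused by every split. The flat per-piece cost of 1 below deliberately ignores the slow-unaligned-32-byte and sub-4-byte adjustments handled in the real hook.

```cpp
// Hypothetical sketch of the cost arithmetic, not LLVM code. It charges one
// unit per legal-width piece and models the new early return for uniform loads.
#include <cstdio>

static int splitMemoryOpCost(int totalBytes, int legalVecBytes, bool isLoad,
                             bool isUniform) {
  int cost = 0;
  for (int doneBytes = 0; doneBytes < totalBytes; doneBytes += legalVecBytes) {
    cost += 1; // one load/store per legal-width piece
    // New behaviour from this patch: a uniform load needs only the first
    // (widest) piece, since its value can be reused by every split.
    if (isLoad && isUniform)
      return cost;
  }
  return cost;
}

int main() {
  // A 64-byte access (<8 x i64>) split into 16-byte (SSE) pieces:
  std::printf("non-uniform load: %d\n", splitMemoryOpCost(64, 16, true, false)); // 4
  std::printf("uniform load:     %d\n", splitMemoryOpCost(64, 16, true, true));  // 1
  return 0;
}
```
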
45 changes: 11 additions & 34 deletions llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll
@@ -1,42 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s
; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s

@arr = global [20 x i64] zeroinitializer, align 16

define void @PR111126() {
; SSE-LABEL: @PR111126(
; SSE-NEXT: store i64 1, ptr @arr, align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 16), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 24), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 40), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 48), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 56), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 72), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 80), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 88), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 104), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 112), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 120), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 136), align 8
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 144), align 16
; SSE-NEXT: store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 152), align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @PR111126(
; AVX-NEXT: store <4 x i64> splat (i64 1), ptr @arr, align 16
; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
; AVX-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
; AVX-NEXT: ret void
; CHECK-LABEL: @PR111126(
; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr @arr, align 16
; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
; CHECK-NEXT: store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
; CHECK-NEXT: ret void
;
store i64 1, ptr @arr, align 16
store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
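
For context, a sketch of how a cost-model client (such as the SLP vectorizer exercised by this test) might query the updated hook. The helper below is hypothetical, assumes a TargetTransformInfo has already been built for an X86 function, and the real call sites in SLP differ; it only shows that passing operand info marked as uniform is what opts into the reduced cost for a to-be-split wide load.

```cpp
// Hypothetical helper (not part of the patch): query getMemoryOpCost for a
// wide load whose value is known to be uniform. On an SSE-only target,
// <8 x i64> would be split into four 16-byte loads; with this patch the
// uniform case is charged for only the first (widest) piece.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

InstructionCost getUniformLoadCost(const TargetTransformInfo &TTI,
                                   LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt64Ty(Ctx), 8);
  TargetTransformInfo::OperandValueInfo OpInfo = {
      TargetTransformInfo::OK_UniformValue, TargetTransformInfo::OP_None};
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(16),
                             /*AddressSpace=*/0,
                             TargetTransformInfo::TCK_RecipThroughput, OpInfo);
}
```
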