
Commit 85d15bd

[TTI][X86] getMemoryOpCost - reduced costs when loading uniform values due to value reuse (#118642)
Similar to what we do for broadcast shuffles: when legalising load costs, if the value is known to be uniform, we only load a single vector and reuse it across the split legalised registers. Fixes #111126
1 parent 1b95e76 commit 85d15bd
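
For context, the effect of the change can be illustrated with a small standalone sketch (simplified; the function name estimateLoadCost and the cost constants are illustrative, not LLVM's actual implementation). The legalisation loop charges each split register access, but for a uniform load it now returns after the first (widest) register has been charged, since that single load can be reused by the remaining splits.

#include <cstdio>

// Simplified stand-in for the legalized memory-op costing loop in
// X86TTIImpl::getMemoryOpCost. Names and constants are illustrative only.
static int estimateLoadCost(int totalBytes, int regBytes, bool slowUnaligned32,
                            bool isUniformLoad) {
  int cost = 0;
  for (int done = 0; done < totalBytes; done += regBytes) {
    // Charge the current legalized register access.
    if (regBytes == 32 && slowUnaligned32)
      cost += 2; // proxy for a double-pumped 32-byte AVX access
    else if (regBytes < 4)
      cost += 2; // sub-32-bit accesses need PINSR*/PEXTR* or scalarization
    else
      cost += 1;

    // New behaviour: a uniform value needs only one (widest) vector load,
    // which the remaining splits can reuse, so stop accumulating cost here.
    if (isUniformLoad)
      return cost;
  }
  return cost;
}

int main() {
  // e.g. a 64-byte load legalized into two 32-byte AVX registers:
  std::printf("non-uniform: %d\n", estimateLoadCost(64, 32, false, false)); // 2
  std::printf("uniform:     %d\n", estimateLoadCost(64, 32, false, true));  // 1
  return 0;
}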

File tree

2 files changed: +28 −46 lines

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 17 additions & 12 deletions
@@ -5237,6 +5237,23 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
           CurrOpSizeBytes != 1)
         break; // Try smalled vector size.
 
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
+      // will be scalarized.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else if (CurrOpSizeBytes < 4)
+        Cost += 2;
+      else
+        Cost += 1;
+
+      // If we're loading a uniform value, then we don't need to split the load,
+      // loading just a single (widest) vector can be reused by all splits.
+      if (IsLoad && OpInfo.isUniform())
+        return Cost;
+
       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
 
       // If we have fully processed the previous reg, we need to replenish it.
@@ -5265,18 +5282,6 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            !IsLoad, CostKind);
       }
 
-      // This isn't exactly right. We're using slow unaligned 32-byte accesses
-      // as a proxy for a double-pumped AVX memory interface such as on
-      // Sandybridge.
-      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
-      // will be scalarized.
-      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
-        Cost += 2;
-      else if (CurrOpSizeBytes < 4)
-        Cost += 2;
-      else
-        Cost += 1;
-
       SubVecEltsLeft -= CurrNumEltPerOp;
       NumEltRemaining -= CurrNumEltPerOp;
       Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);

llvm/test/Transforms/SLPVectorizer/X86/store-constant.ll

Lines changed: 11 additions & 34 deletions
@@ -1,42 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v2 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v3 | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-- -passes=slp-vectorizer -mcpu=x86-64-v4 | FileCheck %s
 
 @arr = global [20 x i64] zeroinitializer, align 16
 
 define void @PR111126() {
-; SSE-LABEL: @PR111126(
-; SSE-NEXT:    store i64 1, ptr @arr, align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 16), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 24), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 40), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 48), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 56), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 72), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 80), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 88), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 104), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 112), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 120), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 136), align 8
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 144), align 16
-; SSE-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 152), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @PR111126(
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
-; AVX-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @PR111126(
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr @arr, align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 32), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 64), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 96), align 16
+; CHECK-NEXT:    store <4 x i64> splat (i64 1), ptr getelementptr inbounds (i8, ptr @arr, i64 128), align 16
+; CHECK-NEXT:    ret void
 ;
   store i64 1, ptr @arr, align 16
   store i64 1, ptr getelementptr inbounds (i8, ptr @arr, i64 8), align 8
