[Instructions] cache computed shufflevector properties #115536

Prince781 · 2024-11-08T19:06:34Z

Cache computed properties of a shufflevector mask.

llvmbot · 2024-11-08T19:07:11Z

@llvm/pr-subscribers-llvm-ir

Author: Princeton Ferro (Prince781)

Changes

Cache computed properties of a shufflevector mask.

Full diff: https://github.com/llvm/llvm-project/pull/115536.diff

2 Files Affected:

(modified) llvm/include/llvm/IR/Instructions.h (+76-9)
(modified) llvm/lib/IR/Instructions.cpp (+26-12)

diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index b6575d4c85724c..c60f2f021b3c89 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -1848,6 +1848,39 @@ class ShuffleVectorInst : public Instruction {
 
   SmallVector<int, 4> ShuffleMask;
   Constant *ShuffleMaskForBitcode;
+  struct ShuffleProperties {
+    bool isSingleSource : 1;
+    bool isSingleSource_set : 1;
+    bool isIdentityMask : 1;
+    bool isIdentityMask_set : 1;
+    bool isIdentityWithPadding : 1;
+    bool isIdentityWithPadding_set : 1;
+    bool isIdentityWithExtract : 1;
+    bool isIdentityWithExtract_set : 1;
+    bool isConcat : 1;
+    bool isConcat_set : 1;
+    bool isSelect : 1;
+    bool isSelect_set : 1;
+    bool isReverse : 1;
+    bool isReverse_set : 1;
+    bool isZeroEltSplat : 1;
+    bool isZeroEltSplat_set : 1;
+    bool isTranspose : 1;
+    bool isTranspose_set : 1;
+
+    void unset() {
+      isSingleSource_set = false;
+      isIdentityMask_set = false;
+      isIdentityWithPadding_set = false;
+      isIdentityWithExtract_set = false;
+      isConcat_set = false;
+      isSelect_set = false;
+      isReverse_set = false;
+      isZeroEltSplat_set = false;
+      isTranspose_set = false;
+    }
+  };
+  mutable ShuffleProperties CachedShuffleProperties = {};
 
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
@@ -1959,8 +1992,13 @@ class ShuffleVectorInst : public Instruction {
   /// Example: shufflevector <4 x n> A, <4 x n> B, <3,0,undef,3>
   /// TODO: Optionally allow length-changing shuffles.
   bool isSingleSource() const {
-    return !changesLength() &&
-           isSingleSourceMask(ShuffleMask, ShuffleMask.size());
+    if (CachedShuffleProperties.isSingleSource_set)
+      return CachedShuffleProperties.isSingleSource;
+
+    CachedShuffleProperties.isSingleSource_set = true;
+    return CachedShuffleProperties.isSingleSource =
+               !changesLength() &&
+               isSingleSourceMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask chooses elements from exactly one source
@@ -1987,12 +2025,18 @@ class ShuffleVectorInst : public Instruction {
   /// from its input vectors.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <4,undef,6,undef>
   bool isIdentity() const {
+    if (CachedShuffleProperties.isIdentityMask_set)
+      return CachedShuffleProperties.isIdentityMask;
+
+    CachedShuffleProperties.isIdentityMask_set = true;
     // Not possible to express a shuffle mask for a scalable vector for this
     // case.
     if (isa<ScalableVectorType>(getType()))
-      return false;
+      return CachedShuffleProperties.isIdentityMask = false;
 
-    return !changesLength() && isIdentityMask(ShuffleMask, ShuffleMask.size());
+    return CachedShuffleProperties.isIdentityMask =
+               !changesLength() &&
+               isIdentityMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle lengthens exactly one source vector with
@@ -2033,7 +2077,13 @@ class ShuffleVectorInst : public Instruction {
   /// In that case, the shuffle is better classified as an identity shuffle.
   /// TODO: Optionally allow length-changing shuffles.
   bool isSelect() const {
-    return !changesLength() && isSelectMask(ShuffleMask, ShuffleMask.size());
+    if (CachedShuffleProperties.isSelect_set)
+      return CachedShuffleProperties.isSelect;
+
+    CachedShuffleProperties.isSelect_set = true;
+    return CachedShuffleProperties.isSelect =
+               !changesLength() &&
+               isSelectMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask swaps the order of elements from exactly
@@ -2054,7 +2104,13 @@ class ShuffleVectorInst : public Instruction {
   /// Example: shufflevector <4 x n> A, <4 x n> B, <3,undef,1,undef>
   /// TODO: Optionally allow length-changing shuffles.
   bool isReverse() const {
-    return !changesLength() && isReverseMask(ShuffleMask, ShuffleMask.size());
+    if (CachedShuffleProperties.isReverse_set)
+      return CachedShuffleProperties.isReverse;
+
+    CachedShuffleProperties.isReverse_set = true;
+    return CachedShuffleProperties.isReverse =
+               !changesLength() &&
+               isReverseMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask chooses all elements with the same value
@@ -2077,8 +2133,13 @@ class ShuffleVectorInst : public Instruction {
   /// TODO: Optionally allow length-changing shuffles.
   /// TODO: Optionally allow splats from other elements.
   bool isZeroEltSplat() const {
-    return !changesLength() &&
-           isZeroEltSplatMask(ShuffleMask, ShuffleMask.size());
+    if (CachedShuffleProperties.isZeroEltSplat_set)
+      return CachedShuffleProperties.isZeroEltSplat;
+
+    CachedShuffleProperties.isZeroEltSplat_set = true;
+    return CachedShuffleProperties.isZeroEltSplat =
+               !changesLength() &&
+               isZeroEltSplatMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask is a transpose mask.
@@ -2127,7 +2188,13 @@ class ShuffleVectorInst : public Instruction {
   /// exact specification.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <0,4,2,6>
   bool isTranspose() const {
-    return !changesLength() && isTransposeMask(ShuffleMask, ShuffleMask.size());
+    if (CachedShuffleProperties.isTranspose_set)
+      return CachedShuffleProperties.isTranspose;
+
+    CachedShuffleProperties.isTranspose_set = true;
+    return CachedShuffleProperties.isTranspose =
+               !changesLength() &&
+               isTransposeMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask is a splice mask, concatenating the two
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 009e0c03957c97..987bf2a031fd2d 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1810,6 +1810,7 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask,
 }
 
 void ShuffleVectorInst::setShuffleMask(ArrayRef<int> Mask) {
+  CachedShuffleProperties.unset();
   ShuffleMask.assign(Mask.begin(), Mask.end());
   ShuffleMaskForBitcode = convertShuffleMaskForBitcode(Mask, getType());
 }
@@ -2100,63 +2101,76 @@ bool ShuffleVectorInst::isInsertSubvectorMask(ArrayRef<int> Mask,
 }
 
 bool ShuffleVectorInst::isIdentityWithPadding() const {
+  if (CachedShuffleProperties.isIdentityWithPadding_set)
+    return CachedShuffleProperties.isIdentityWithPadding;
+
+  CachedShuffleProperties.isIdentityWithPadding_set = true;
   // FIXME: Not currently possible to express a shuffle mask for a scalable
   // vector for this case.
   if (isa<ScalableVectorType>(getType()))
-    return false;
+    return CachedShuffleProperties.isIdentityWithPadding = false;
 
   int NumOpElts = cast<FixedVectorType>(Op<0>()->getType())->getNumElements();
   int NumMaskElts = cast<FixedVectorType>(getType())->getNumElements();
   if (NumMaskElts <= NumOpElts)
-    return false;
+    return CachedShuffleProperties.isIdentityWithPadding = false;
 
   // The first part of the mask must choose elements from exactly 1 source op.
   ArrayRef<int> Mask = getShuffleMask();
   if (!isIdentityMaskImpl(Mask, NumOpElts))
-    return false;
+    return CachedShuffleProperties.isIdentityWithPadding = false;
 
   // All extending must be with undef elements.
   for (int i = NumOpElts; i < NumMaskElts; ++i)
     if (Mask[i] != -1)
-      return false;
+      return CachedShuffleProperties.isIdentityWithPadding = false;
 
-  return true;
+  return CachedShuffleProperties.isIdentityWithPadding = true;
 }
 
 bool ShuffleVectorInst::isIdentityWithExtract() const {
+  if (CachedShuffleProperties.isIdentityWithExtract_set)
+    return CachedShuffleProperties.isIdentityWithExtract;
+
+  CachedShuffleProperties.isIdentityWithExtract_set = true;
   // FIXME: Not currently possible to express a shuffle mask for a scalable
   // vector for this case.
   if (isa<ScalableVectorType>(getType()))
-    return false;
+    return CachedShuffleProperties.isIdentityWithExtract = false;
 
   int NumOpElts = cast<FixedVectorType>(Op<0>()->getType())->getNumElements();
   int NumMaskElts = cast<FixedVectorType>(getType())->getNumElements();
   if (NumMaskElts >= NumOpElts)
-    return false;
+    return CachedShuffleProperties.isIdentityWithExtract = false;
 
-  return isIdentityMaskImpl(getShuffleMask(), NumOpElts);
+  return CachedShuffleProperties.isIdentityWithExtract = isIdentityMaskImpl(getShuffleMask(), NumOpElts);
 }
 
 bool ShuffleVectorInst::isConcat() const {
+  if (CachedShuffleProperties.isConcat_set)
+    return CachedShuffleProperties.isConcat;
+
+  CachedShuffleProperties.isConcat_set = true;
   // Vector concatenation is differentiated from identity with padding.
   if (isa<UndefValue>(Op<0>()) || isa<UndefValue>(Op<1>()))
-    return false;
+    return CachedShuffleProperties.isConcat = false;
 
   // FIXME: Not currently possible to express a shuffle mask for a scalable
   // vector for this case.
   if (isa<ScalableVectorType>(getType()))
-    return false;
+    return CachedShuffleProperties.isConcat = false;
 
   int NumOpElts = cast<FixedVectorType>(Op<0>()->getType())->getNumElements();
   int NumMaskElts = cast<FixedVectorType>(getType())->getNumElements();
   if (NumMaskElts != NumOpElts * 2)
-    return false;
+    return CachedShuffleProperties.isConcat = false;
 
   // Use the mask length rather than the operands' vector lengths here. We
   // already know that the shuffle returns a vector twice as long as the inputs,
   // and neither of the inputs are undef vectors. If the mask picks consecutive
   // elements from both inputs, then this is a concatenation of the inputs.
-  return isIdentityMaskImpl(getShuffleMask(), NumMaskElts);
+  return CachedShuffleProperties.isConcat =
+             isIdentityMaskImpl(getShuffleMask(), NumMaskElts);
 }
 
 static bool isReplicationMaskWithParams(ArrayRef<int> Mask,

github-actions · 2024-11-08T19:10:36Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Prince781 · 2024-11-13T09:00:26Z

@nikic If I'm not misreading these build logs, the errors look unrelated. Would it be possible for you to assign someone to review this?

nikic

I didn't see any compile-time impact when testing this, but probably this needs a fairly specific workload to see benefits. Could you maybe provide some numbers on what you see?

In terms of implementation, I do wonder whether it would make sense to always compute the full properties on construction / mask change, as the lazy initialization makes this somewhat ugly.

Prince781 · 2024-11-14T18:28:31Z

Hi @nikic,

We have some internal kernels that make heavy use of very large vectors (256, 512, 1024 elements). We find a 20-40% compile time reduction on these by caching shufflevector mask properties. I think this impact is hard to see with smaller sizes.

Prince781 · 2024-11-15T22:21:42Z

The lazy initialization is helpful for our compile time cases, so I'd prefer to keep it. I can refine this patch to be easier to read.

RKSimon · 2024-11-16T16:43:33Z

We could consider creating a TTI::ShuffleKindSet mask type and create the entire set in one call in the ShuffleVectorInst constructor - the ShuffleVectorInst::is*Mask calls are all very similar, it shouldn't take much to merge them all into a single analysis loop.

Prince781 · 2024-11-16T17:03:11Z

@RKSimon interesting idea. I'll try that out.

Prince781 · 2024-12-01T01:03:45Z

@RKSimon why would it be in TTI though? Aren't these target-independent properties?

Prince781 · 2024-12-01T04:32:07Z

Nevermind, I see that ShuffleVectorInst methods are used in BasicTTIImpl.h

Prince781 · 2024-12-09T05:34:51Z

Hi all,

Updated the PR to merge shufflevector attribute analyses into a single pass over the mask. I've left other users (like BasicTTIImpl) alone for now, as I don't have a real-world example to motivate changing them.

Prince781 · 2024-12-09T18:08:26Z

Improvements on our (@NVIDIA's) end:

before:

===-------------------------------------------------------------------------===
                          Pass execution timing report
===-------------------------------------------------------------------------===
  Total Execution Time: 76.5499 seconds (76.5545 wall clock)

   ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Name ---
  72.8495 ( 96.6%)   1.0520 ( 94.9%)  73.9014 ( 96.5%)  73.9059 ( 96.5%)  InstCombinePass #2
   2.1374 (  2.8%)   0.0359 (  3.2%)   2.1733 (  2.8%)   2.1734 (  2.8%)  InstCombinePass #1
   0.0968 (  0.1%)   0.0000 (  0.0%)   0.0968 (  0.1%)   0.0968 (  0.1%)  GVNPass #2
   0.0545 (  0.1%)   0.0120 (  1.1%)   0.0665 (  0.1%)   0.0666 (  0.1%)  SROAPass #1
...

after this change:

===-------------------------------------------------------------------------===
                          Pass execution timing report
===-------------------------------------------------------------------------===
  Total Execution Time: 60.3458 seconds (60.3485 wall clock)

   ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Name ---
  56.9568 ( 95.8%)   0.8480 ( 95.5%)  57.8048 ( 95.8%)  57.8074 ( 95.8%)  InstCombinePass #2
   2.0506 (  3.4%)   0.0239 (  2.7%)   2.0745 (  3.4%)   2.0746 (  3.4%)  InstCombinePass #1
   0.0934 (  0.2%)   0.0040 (  0.5%)   0.0974 (  0.2%)   0.0974 (  0.2%)  GVNPass #2
   0.0542 (  0.1%)   0.0120 (  1.3%)   0.0662 (  0.1%)   0.0662 (  0.1%)  SROAPass #1
...

after this change with -instcombine-simplify-vector-elts-depth=1:

===-------------------------------------------------------------------------===
                          Pass execution timing report
===-------------------------------------------------------------------------===
  Total Execution Time: 22.5138 seconds (22.5148 wall clock)

   ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Name ---
  21.7351 ( 96.6%)   0.0000 (  0.0%)  21.7351 ( 96.5%)  21.7360 ( 96.5%)  InstCombinePass #2
   0.4445 (  2.0%)   0.0037 ( 24.0%)   0.4482 (  2.0%)   0.4482 (  2.0%)  InstCombinePass #1
   0.0980 (  0.4%)   0.0000 (  0.0%)   0.0980 (  0.4%)   0.0980 (  0.4%)  GVNPass #2
   0.0729 (  0.3%)   0.0080 ( 52.8%)   0.0809 (  0.4%)   0.0809 (  0.4%)  SROAPass #1
...

Prince781 · 2024-12-16T14:59:45Z

Ping.

nikic

It may be cleaner to separate out the scalable case into a separate code path, because there is only a single supported scalable shufflevector operation (which is the zero index splat).

llvm/lib/IR/Instructions.cpp

- Cache computed properties of a shufflevector mask. - Merge several shuffle mask property analyses into a single analysis and introduce ShuffleMaskAttrs. - Compute the properties on shufflevector construction.

This reverts commit 24a39050595b7f32e7374f67d281d91bfd3cf303.

Simplify implementation

Prince781 · 2025-01-23T00:38:15Z

Ping reviewers.

nikic

LGTM

nikic · 2025-01-24T15:52:10Z

llvm/include/llvm/IR/Instructions.h

+
+  // (Re)compute the shuffle mask attributes.
+  void computeShuffleAttrs() {
+    ShuffleAttrs = {}; // reset


Not needed, as you're reassigning all members anyway.

Prince781 · 2025-01-24T23:31:07Z

Just in case, please hold off on merging as I need to do some more compile time testing.

RKSimon

LGTM cheers - @Prince781 hows the compile time checking going?

Prince781 · 2025-01-28T17:27:43Z

Hi @RKSimon

While this improves the original case I was interested in, I'm finding with a newer kernel we recently added that the 8b size increase for ShuffleVectorInst is causing (CPU) caching problems. So I'd like to still hold off on merging.

llvmbot added the llvm:ir label Nov 8, 2024

Prince781 force-pushed the dev/pferro/shufflevector-property-caching branch 2 times, most recently from a44d200 to 6a332ad Compare November 12, 2024 08:45

nikic requested review from RKSimon and alexey-bataev November 13, 2024 09:17

nikic reviewed Nov 13, 2024

View reviewed changes

Prince781 force-pushed the dev/pferro/shufflevector-property-caching branch from aa7849a to 18ac1aa Compare November 16, 2024 17:46

Prince781 force-pushed the dev/pferro/shufflevector-property-caching branch 2 times, most recently from bfa9cb1 to 8e12ade Compare December 8, 2024 02:40

nikic reviewed Dec 17, 2024

View reviewed changes

RKSimon reviewed Dec 17, 2024

View reviewed changes

llvm/lib/IR/Instructions.cpp Outdated Show resolved Hide resolved

Prince781 force-pushed the dev/pferro/shufflevector-property-caching branch 4 times, most recently from 04b15aa to b6fd581 Compare January 22, 2025 09:12

Prince781 added 3 commits January 22, 2025 13:23

[Instructions] cache computed shufflevector properties

4a0ab38

- Cache computed properties of a shufflevector mask. - Merge several shuffle mask property analyses into a single analysis and introduce ShuffleMaskAttrs. - Compute the properties on shufflevector construction.

Revert "[Instructions] cache computed shufflevector properties"

4666be3

This reverts commit 24a39050595b7f32e7374f67d281d91bfd3cf303.

[Instructions] reimplement shufflevector property caching

bd5152a

Simplify implementation

remove unused fields in ShuffleMaskAttrs

3fd0f67

Prince781 force-pushed the dev/pferro/shufflevector-property-caching branch from b6fd581 to 3fd0f67 Compare January 22, 2025 21:24

remove mutable

612ba77

nikic approved these changes Jan 24, 2025

View reviewed changes

RKSimon approved these changes Jan 28, 2025

View reviewed changes

Prince781 marked this pull request as draft February 6, 2025 01:54

[Instructions] cache computed shufflevector properties #115536

Are you sure you want to change the base?

[Instructions] cache computed shufflevector properties #115536

Conversation

Prince781 commented Nov 8, 2024

Uh oh!

llvmbot commented Nov 8, 2024

Uh oh!

github-actions bot commented Nov 8, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Prince781 commented Nov 13, 2024

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

Prince781 commented Nov 14, 2024

Uh oh!

Prince781 commented Nov 15, 2024

Uh oh!

RKSimon commented Nov 16, 2024

Uh oh!

Prince781 commented Nov 16, 2024

Uh oh!

Prince781 commented Dec 1, 2024

Uh oh!

Prince781 commented Dec 1, 2024

Uh oh!

Prince781 commented Dec 9, 2024

Uh oh!

Prince781 commented Dec 9, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Prince781 commented Dec 16, 2024

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Prince781 commented Jan 23, 2025

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

nikic Jan 24, 2025

Choose a reason for hiding this comment

Uh oh!

Prince781 commented Jan 24, 2025

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Prince781 commented Jan 28, 2025

Uh oh!

Uh oh!

github-actions bot commented Nov 8, 2024 •

edited

Loading

Prince781 commented Dec 9, 2024 •

edited

Loading