[InstCombine] Unpack scalable struct loads/stores. #123986

Merged: 1 commit merged into llvm:main from the gh-ic-splitscalablestruct branch on Jan 23, 2025

Conversation

davemgreen (Collaborator) commented:

This teaches unpackLoadToAggregate and unpackStoreToAggregate to unpack scalable structs into individual loads/stores with insertvalue / extractvalue instructions. The GEP used for the offsets is an i8 ptradd rather than a struct GEP, since struct GEPs are not supported on structs containing scalable vectors and scalable GEPs are canonicalized to i8 anyway.
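
For illustration, here is a minimal sketch of the IR this produces (hypothetical value names; the offsets match the tests below) for the second element of { <vscale x 4 x i32>, <vscale x 4 x i32> }, which sits 16 * vscale bytes into the struct:

    ; the second element's address is computed from vscale and an i8 ptradd,
    ; since a struct gep cannot express a vscale-dependent offset
    %vs   = call i64 @llvm.vscale.i64()
    %off  = shl i64 %vs, 4                               ; 16 * vscale bytes
    %elt1 = getelementptr inbounds i8, ptr %p, i64 %off
    %v1   = load <vscale x 4 x i32>, ptr %elt1, align 16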

llvmbot added the llvm:instcombine and llvm:transforms labels on Jan 22, 2025
llvmbot (Member) commented on Jan 22, 2025

@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes

This teaches unpackLoadToAggregate and unpackStoreToAggregate to unpack scalable structs into individual loads/stores with insertvalue / extractvalue instructions. The GEP used for the offsets is an i8 ptradd rather than a struct GEP, since struct GEPs are not supported on structs containing scalable vectors and scalable GEPs are canonicalized to i8 anyway.


Full diff: https://github.com/llvm/llvm-project/pull/123986.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp (+22-14)
  • (added) llvm/test/Transforms/InstCombine/split_scalable_struct.ll (+159)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 61f1c17592e966..89f70959119170 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -704,10 +704,6 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
     const DataLayout &DL = IC.getDataLayout();
     auto *SL = DL.getStructLayout(ST);
 
-    // Don't unpack for structure with scalable vector.
-    if (SL->getSizeInBits().isScalable())
-      return nullptr;
-
     if (SL->hasPadding())
       return nullptr;
 
@@ -722,11 +718,19 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
         Zero,
         ConstantInt::get(IdxType, i),
       };
-      auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices),
-                                               Name + ".elt");
+      auto *Ptr = !SL->getSizeInBits().isScalable()
+                      ? IC.Builder.CreateInBoundsGEP(
+                            ST, Addr, ArrayRef(Indices), Name + ".elt")
+                      : IC.Builder.CreateInBoundsPtrAdd(
+                            Addr,
+                            IC.Builder.CreateVScale(ConstantInt::get(
+                                DL.getIndexType(Addr->getType()),
+                                SL->getElementOffset(i).getKnownMinValue())),
+                            Name + ".elt");
       auto *L = IC.Builder.CreateAlignedLoad(
           ST->getElementType(i), Ptr,
-          commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
+          commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue()),
+          Name + ".unpack");
       // Propagate AA metadata. It'll still be valid on the narrowed load.
       L->setAAMetadata(LI.getAAMetadata());
       V = IC.Builder.CreateInsertValue(V, L, i);
@@ -1222,10 +1226,6 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
     const DataLayout &DL = IC.getDataLayout();
     auto *SL = DL.getStructLayout(ST);
 
-    // Don't unpack for structure with scalable vector.
-    if (SL->getSizeInBits().isScalable())
-      return false;
-
     if (SL->hasPadding())
       return false;
 
@@ -1244,10 +1244,18 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
         Zero,
         ConstantInt::get(IdxType, i),
       };
-      auto *Ptr =
-          IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices), AddrName);
+      auto *Ptr = !SL->getSizeInBits().isScalable()
+                      ? IC.Builder.CreateInBoundsGEP(
+                            ST, Addr, ArrayRef(Indices), AddrName)
+                      : IC.Builder.CreateInBoundsPtrAdd(
+                            Addr,
+                            IC.Builder.CreateVScale(ConstantInt::get(
+                                DL.getIndexType(Addr->getType()),
+                                SL->getElementOffset(i).getKnownMinValue())),
+                            AddrName);
       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
-      auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
+      auto EltAlign =
+          commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue());
       llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
       NS->setAAMetadata(SI.getAAMetadata());
     }
diff --git a/llvm/test/Transforms/InstCombine/split_scalable_struct.ll b/llvm/test/Transforms/InstCombine/split_scalable_struct.ll
new file mode 100644
index 00000000000000..bacb854d258b57
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/split_scalable_struct.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @split_load(ptr %p) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @split_load(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[R_ELT1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT:    [[R_UNPACK2:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[R_UNPACK2]], 1
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+;
+entry:
+  %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %r
+}
+
+define {<vscale x 16 x i8>} @split_load_one(ptr %p) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8> } @split_load_one(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[R1:%.*]] = insertvalue { <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    ret { <vscale x 16 x i8> } [[R1]]
+;
+entry:
+  %r = load {<vscale x 16 x i8>}, ptr %p
+  ret {<vscale x 16 x i8>} %r
+}
+
+define void @split_store({<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define void @split_store(
+; CHECK-SAME: { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
+  ret void
+}
+
+define void @split_store_one({<vscale x 4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define void @split_store_one(
+; CHECK-SAME: { <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP0]], ptr [[P]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  store {<vscale x 4 x i32>} %x, ptr %p
+  ret void
+}
+
+define {<16 x i8>, <16 x i8>} @check_v16i8_v4i32({<4 x i32>, <4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define { <16 x i8>, <16 x i8> } @check_v16i8_v4i32(
+; CHECK-SAME: { <4 x i32>, <4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT:    [[R_UNPACK_CAST:%.*]] = bitcast <4 x i32> [[X_ELT]] to <16 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <16 x i8>, <16 x i8> } poison, <16 x i8> [[R_UNPACK_CAST]], 0
+; CHECK-NEXT:    [[R_UNPACK4_CAST:%.*]] = bitcast <4 x i32> [[X_ELT2]] to <16 x i8>
+; CHECK-NEXT:    [[R5:%.*]] = insertvalue { <16 x i8>, <16 x i8> } [[TMP0]], <16 x i8> [[R_UNPACK4_CAST]], 1
+; CHECK-NEXT:    ret { <16 x i8>, <16 x i8> } [[R5]]
+;
+entry:
+  store {<4 x i32>, <4 x i32>} %x, ptr %p
+  %r = load {<16 x i8>, <16 x i8>}, ptr %p
+  ret {<16 x i8>, <16 x i8>} %r
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @check_nxv16i8_nxv4i32({<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @check_nxv16i8_nxv4i32(
+; CHECK-SAME: { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT:    [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
+; CHECK-NEXT:    [[R:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+;
+entry:
+  store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
+  %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %r
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @alloca_nxv16i8_nxv4i32({<vscale x 4 x i32>, <vscale x 4 x i32>} %x) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @alloca_nxv16i8_nxv4i32(
+; CHECK-SAME: { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT:    [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
+; CHECK-NEXT:    [[R:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+;
+entry:
+  %p = alloca {<vscale x 4 x i32>, <vscale x 4 x i32>}
+  store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
+  %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %r
+}
+
+define { <16 x i8>, <32 x i8> } @differenttypes({ <4 x i32>, <8 x i32> } %a, ptr %p) {
+; CHECK-LABEL: define { <16 x i8>, <32 x i8> } @differenttypes(
+; CHECK-SAME: { <4 x i32>, <8 x i32> } [[A:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[P]])
+; CHECK-NEXT:    store { <4 x i32>, <8 x i32> } [[A]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load { <16 x i8>, <32 x i8> }, ptr [[P]], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[P]])
+; CHECK-NEXT:    ret { <16 x i8>, <32 x i8> } [[TMP0]]
+;
+entry:
+  call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %p) #5
+  store { <4 x i32>, <8 x i32> } %a, ptr %p, align 16
+  %2 = load { <16 x i8>, <32 x i8> }, ptr %p, align 16
+  call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %p) #5
+  ret { <16 x i8>, <32 x i8> } %2
+}

Review comment from a Contributor on these lines of the patch:

    Addr,
    IC.Builder.CreateVScale(ConstantInt::get(
        DL.getIndexType(Addr->getType()),
        SL->getElementOffset(i).getKnownMinValue())),

Use IRBuilder::CreateTypeSize() and then you can use the same code for scalable and not?
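
A minimal sketch of what that suggestion could look like, assuming the surrounding variables from the patch (IC, DL, SL, Addr, i, Name); this illustrates the reviewer's idea rather than the code as committed:

    // CreateTypeSize emits a plain constant for a fixed-size offset and a
    // vscale-scaled value for a scalable one, so a single ptradd-based path
    // could cover both cases without the isScalable() branch.
    Value *Offset = IC.Builder.CreateTypeSize(
        DL.getIndexType(Addr->getType()), SL->getElementOffset(i));
    auto *Ptr = IC.Builder.CreateInBoundsPtrAdd(Addr, Offset, Name + ".elt");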

davemgreen force-pushed the gh-ic-splitscalablestruct branch from 20c8e7e to 30471d2 on January 23, 2025 at 07:54
nikic (Contributor) left a comment:

LGTM, thanks!

davemgreen force-pushed the gh-ic-splitscalablestruct branch from 30471d2 to 362fb8c on January 23, 2025 at 18:00
davemgreen (Collaborator, Author) commented:

Thanks

davemgreen merged commit bec4c7f into llvm:main on Jan 23, 2025 (5 of 6 checks passed).
davemgreen deleted the gh-ic-splitscalablestruct branch on January 23, 2025 at 18:04.