[InstCombine] Unpack scalable struct loads/stores. #123986
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes

This teaches unpackLoadToAggregate and unpackStoreToAggregate to unpack scalable structs into individual loads/stores with insertvalues / extractvalues. The gep used for the offsets uses an i8 ptradd rather than a struct gep, as struct geps are not supported for scalable types and we canonicalize scalable geps to i8.

Full diff: https://github.com/llvm/llvm-project/pull/123986.diff

2 Files Affected:
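For illustration, this is the `@split_load` case from the new test file below: the aggregate load of a two-element scalable struct becomes one load per element, with the second element's address computed as an i8 ptradd of vscale * 16 bytes rather than a struct gep.

```llvm
; Simplified before/after, adapted from the @split_load test added below.
; Before: a single aggregate load of a scalable struct.
;   %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
; After: per-element loads rebuilt with insertvalue; the second element is
; addressed through an i8 ptradd of vscale * 16 bytes instead of a struct gep.
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @split_load(ptr %p) {
entry:
  %r.unpack = load <vscale x 16 x i8>, ptr %p, align 16
  %agg = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %r.unpack, 0
  %vscale = call i64 @llvm.vscale.i64()
  %offset = shl i64 %vscale, 4
  %r.elt1 = getelementptr inbounds i8, ptr %p, i64 %offset
  %r.unpack2 = load <vscale x 16 x i8>, ptr %r.elt1, align 16
  %r = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %agg, <vscale x 16 x i8> %r.unpack2, 1
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %r
}

declare i64 @llvm.vscale.i64()
```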
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 61f1c17592e966..89f70959119170 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -704,10 +704,6 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
const DataLayout &DL = IC.getDataLayout();
auto *SL = DL.getStructLayout(ST);
- // Don't unpack for structure with scalable vector.
- if (SL->getSizeInBits().isScalable())
- return nullptr;
-
if (SL->hasPadding())
return nullptr;
@@ -722,11 +718,19 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices),
- Name + ".elt");
+ auto *Ptr = !SL->getSizeInBits().isScalable()
+ ? IC.Builder.CreateInBoundsGEP(
+ ST, Addr, ArrayRef(Indices), Name + ".elt")
+ : IC.Builder.CreateInBoundsPtrAdd(
+ Addr,
+ IC.Builder.CreateVScale(ConstantInt::get(
+ DL.getIndexType(Addr->getType()),
+ SL->getElementOffset(i).getKnownMinValue())),
+ Name + ".elt");
auto *L = IC.Builder.CreateAlignedLoad(
ST->getElementType(i), Ptr,
- commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
+ commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue()),
+ Name + ".unpack");
// Propagate AA metadata. It'll still be valid on the narrowed load.
L->setAAMetadata(LI.getAAMetadata());
V = IC.Builder.CreateInsertValue(V, L, i);
@@ -1222,10 +1226,6 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
const DataLayout &DL = IC.getDataLayout();
auto *SL = DL.getStructLayout(ST);
- // Don't unpack for structure with scalable vector.
- if (SL->getSizeInBits().isScalable())
- return false;
-
if (SL->hasPadding())
return false;
@@ -1244,10 +1244,18 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr =
- IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices), AddrName);
+ auto *Ptr = !SL->getSizeInBits().isScalable()
+ ? IC.Builder.CreateInBoundsGEP(
+ ST, Addr, ArrayRef(Indices), AddrName)
+ : IC.Builder.CreateInBoundsPtrAdd(
+ Addr,
+ IC.Builder.CreateVScale(ConstantInt::get(
+ DL.getIndexType(Addr->getType()),
+ SL->getElementOffset(i).getKnownMinValue())),
+ AddrName);
auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
+ auto EltAlign =
+ commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue());
llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
NS->setAAMetadata(SI.getAAMetadata());
}
diff --git a/llvm/test/Transforms/InstCombine/split_scalable_struct.ll b/llvm/test/Transforms/InstCombine/split_scalable_struct.ll
new file mode 100644
index 00000000000000..bacb854d258b57
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/split_scalable_struct.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @split_load(ptr %p) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @split_load(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT: [[R_ELT1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: [[R_UNPACK2:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT1]], align 16
+; CHECK-NEXT: [[R:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[R_UNPACK2]], 1
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+;
+entry:
+ %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %r
+}
+
+define {<vscale x 16 x i8>} @split_load_one(ptr %p) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8> } @split_load_one(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT: [[R1:%.*]] = insertvalue { <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT: ret { <vscale x 16 x i8> } [[R1]]
+;
+entry:
+ %r = load {<vscale x 16 x i8>}, ptr %p
+ ret {<vscale x 16 x i8>} %r
+}
+
+define void @split_store({<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define void @split_store(
+; CHECK-SAME: { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
+ ret void
+}
+
+define void @split_store_one({<vscale x 4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define void @split_store_one(
+; CHECK-SAME: { <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP0]], ptr [[P]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ store {<vscale x 4 x i32>} %x, ptr %p
+ ret void
+}
+
+define {<16 x i8>, <16 x i8>} @check_v16i8_v4i32({<4 x i32>, <4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define { <16 x i8>, <16 x i8> } @check_v16i8_v4i32(
+; CHECK-SAME: { <4 x i32>, <4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[X]], 0
+; CHECK-NEXT: store <4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
+; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[X]], 1
+; CHECK-NEXT: store <4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT: [[R_UNPACK_CAST:%.*]] = bitcast <4 x i32> [[X_ELT]] to <16 x i8>
+; CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <16 x i8>, <16 x i8> } poison, <16 x i8> [[R_UNPACK_CAST]], 0
+; CHECK-NEXT: [[R_UNPACK4_CAST:%.*]] = bitcast <4 x i32> [[X_ELT2]] to <16 x i8>
+; CHECK-NEXT: [[R5:%.*]] = insertvalue { <16 x i8>, <16 x i8> } [[TMP0]], <16 x i8> [[R_UNPACK4_CAST]], 1
+; CHECK-NEXT: ret { <16 x i8>, <16 x i8> } [[R5]]
+;
+entry:
+ store {<4 x i32>, <4 x i32>} %x, ptr %p
+ %r = load {<16 x i8>, <16 x i8>}, ptr %p
+ ret {<16 x i8>, <16 x i8>} %r
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @check_nxv16i8_nxv4i32({<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @check_nxv16i8_nxv4i32(
+; CHECK-SAME: { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT: [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT: [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
+; CHECK-NEXT: [[R:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+;
+entry:
+ store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
+ %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %r
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @alloca_nxv16i8_nxv4i32({<vscale x 4 x i32>, <vscale x 4 x i32>} %x) nounwind {
+; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @alloca_nxv16i8_nxv4i32(
+; CHECK-SAME: { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
+; CHECK-NEXT: [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT: [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT: [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT: store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT: [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT: [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT: [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
+; CHECK-NEXT: [[R:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
+; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+;
+entry:
+ %p = alloca {<vscale x 4 x i32>, <vscale x 4 x i32>}
+ store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
+ %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %r
+}
+
+define { <16 x i8>, <32 x i8> } @differenttypes({ <4 x i32>, <8 x i32> } %a, ptr %p) {
+; CHECK-LABEL: define { <16 x i8>, <32 x i8> } @differenttypes(
+; CHECK-SAME: { <4 x i32>, <8 x i32> } [[A:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[P]])
+; CHECK-NEXT: store { <4 x i32>, <8 x i32> } [[A]], ptr [[P]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load { <16 x i8>, <32 x i8> }, ptr [[P]], align 16
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[P]])
+; CHECK-NEXT: ret { <16 x i8>, <32 x i8> } [[TMP0]]
+;
+entry:
+ call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %p) #5
+ store { <4 x i32>, <8 x i32> } %a, ptr %p, align 16
+ %2 = load { <16 x i8>, <32 x i8> }, ptr %p, align 16
+ call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %p) #5
+ ret { <16 x i8>, <32 x i8> } %2
+}
Addr,
IC.Builder.CreateVScale(ConstantInt::get(
    DL.getIndexType(Addr->getType()),
    SL->getElementOffset(i).getKnownMinValue())),
Use IRBuilder::CreateTypeSize() and then you can use the same code for scalable and not?
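A minimal sketch of what that suggestion could look like, reusing the variables from the hunk above (IC, DL, SL, Addr, Name, i); CreateTypeSize emits a plain constant for a fixed offset and a vscale-scaled value for a scalable one, so a single ptradd-based path could serve both cases:

```cpp
// Sketch of the reviewer's suggestion, not the code that landed: the final
// patch keeps the struct GEP for fixed-size structs and only uses the i8
// ptradd for scalable ones.
Value *Offset = IC.Builder.CreateTypeSize(
    DL.getIndexType(Addr->getType()),  // index type, e.g. i64
    SL->getElementOffset(i));          // TypeSize: fixed or scalable byte offset
auto *Ptr = IC.Builder.CreateInBoundsPtrAdd(Addr, Offset, Name + ".elt");
```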
LGTM, thanks!
Thanks