
Commit bec4c7f

[InstCombine] Unpack scalable struct loads/stores. (#123986)
This teaches unpackLoadToAggregate and unpackStoreToAggregate to unpack scalable structs into individual loads/stores with insertvalues/extractvalues. The GEP used for the offsets is an i8 ptradd as opposed to a struct gep, since geps over scalable structs are not supported and we canonicalize to i8 geps anyway.
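As a rough before/after sketch in LLVM IR (distilled from the tests below; %struct.test is assumed here to be { <vscale x 1 x i32>, <vscale x 1 x i32> }, and value names are illustrative), a whole-struct load followed by an extractvalue of element 1:

  %a = load %struct.test, ptr %x
  %b = extractvalue %struct.test %a, 1

now becomes a single element load addressed through an i8 ptradd whose byte offset is computed from vscale:

  %vs  = call i64 @llvm.vscale.i64()
  %off = shl i64 %vs, 2                              ; element 1 starts at 4 x vscale bytes
  %elt = getelementptr inbounds i8, ptr %x, i64 %off
  %b   = load <vscale x 1 x i32>, ptr %elt, align 4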
1 parent 2f6b0b4

File tree

2 files changed: +67 −41 lines

llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
Lines changed: 12 additions & 26 deletions
@@ -704,29 +704,22 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
     const DataLayout &DL = IC.getDataLayout();
     auto *SL = DL.getStructLayout(ST);
 
-    // Don't unpack for structure with scalable vector.
-    if (SL->getSizeInBits().isScalable())
-      return nullptr;
-
     if (SL->hasPadding())
       return nullptr;
 
     const auto Align = LI.getAlign();
     auto *Addr = LI.getPointerOperand();
-    auto *IdxType = Type::getInt32Ty(T->getContext());
-    auto *Zero = ConstantInt::get(IdxType, 0);
+    auto *IdxType = DL.getIndexType(Addr->getType());
 
     Value *V = PoisonValue::get(T);
     for (unsigned i = 0; i < NumElements; i++) {
-      Value *Indices[2] = {
-        Zero,
-        ConstantInt::get(IdxType, i),
-      };
-      auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices),
-                                               Name + ".elt");
+      auto *Ptr = IC.Builder.CreateInBoundsPtrAdd(
+          Addr, IC.Builder.CreateTypeSize(IdxType, SL->getElementOffset(i)),
+          Name + ".elt");
       auto *L = IC.Builder.CreateAlignedLoad(
           ST->getElementType(i), Ptr,
-          commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
+          commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue()),
+          Name + ".unpack");
       // Propagate AA metadata. It'll still be valid on the narrowed load.
       L->setAAMetadata(LI.getAAMetadata());
       V = IC.Builder.CreateInsertValue(V, L, i);
@@ -1222,10 +1215,6 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
     const DataLayout &DL = IC.getDataLayout();
     auto *SL = DL.getStructLayout(ST);
 
-    // Don't unpack for structure with scalable vector.
-    if (SL->getSizeInBits().isScalable())
-      return false;
-
     if (SL->hasPadding())
       return false;
 
@@ -1237,17 +1226,14 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
     SmallString<16> AddrName = Addr->getName();
     AddrName += ".repack";
 
-    auto *IdxType = Type::getInt32Ty(ST->getContext());
-    auto *Zero = ConstantInt::get(IdxType, 0);
+    auto *IdxType = DL.getIndexType(Addr->getType());
     for (unsigned i = 0; i < Count; i++) {
-      Value *Indices[2] = {
-        Zero,
-        ConstantInt::get(IdxType, i),
-      };
-      auto *Ptr =
-          IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices), AddrName);
+      auto *Ptr = IC.Builder.CreateInBoundsPtrAdd(
+          Addr, IC.Builder.CreateTypeSize(IdxType, SL->getElementOffset(i)),
+          AddrName);
       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
-      auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
+      auto EltAlign =
+          commonAlignment(Align, SL->getElementOffset(i).getKnownMinValue());
       llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
       NS->setAAMetadata(SI.getAAMetadata());
     }
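Context for the new offset computation (not part of the diff): for a scalable struct, SL->getElementOffset(i) is a scalable TypeSize, so CreateTypeSize materializes the byte offset at runtime as vscale times the known minimum offset, with the multiply folded to a shift when that minimum is a power of two. Assuming a 64-bit index type and an element at offset 16 x vscale bytes, the emitted IR looks roughly like this (value names illustrative):

  %vscale = call i64 @llvm.vscale.i64()
  %off    = shl i64 %vscale, 4                            ; 16 x vscale bytes
  %elt    = getelementptr inbounds i8, ptr %addr, i64 %off

This is also why commonAlignment now takes getKnownMinValue(): the runtime offset is always a multiple of the known minimum byte offset, so an alignment derived from the minimum remains valid for the scalable case.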

llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
Lines changed: 55 additions & 15 deletions
@@ -6,9 +6,11 @@
 define <vscale x 1 x i32> @load(ptr %x) {
 ; CHECK-LABEL: define <vscale x 1 x i32> @load
 ; CHECK-SAME: (ptr [[X:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = load [[STRUCT_TEST:%.*]], ptr [[X]], align 4
-; CHECK-NEXT:    [[B:%.*]] = extractvalue [[STRUCT_TEST]] [[A]], 1
-; CHECK-NEXT:    ret <vscale x 1 x i32> [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
+; CHECK-NEXT:    [[A_ELT1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
+; CHECK-NEXT:    [[A_UNPACK2:%.*]] = load <vscale x 1 x i32>, ptr [[A_ELT1]], align 4
+; CHECK-NEXT:    ret <vscale x 1 x i32> [[A_UNPACK2]]
 ;
   %a = load %struct.test, ptr %x
   %b = extractvalue %struct.test %a, 1
@@ -18,9 +20,11 @@ define <vscale x 1 x i32> @load(ptr %x) {
 define void @store(ptr %x, <vscale x 1 x i32> %y, <vscale x 1 x i32> %z) {
 ; CHECK-LABEL: define void @store
 ; CHECK-SAME: (ptr [[X:%.*]], <vscale x 1 x i32> [[Y:%.*]], <vscale x 1 x i32> [[Z:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = insertvalue [[STRUCT_TEST:%.*]] undef, <vscale x 1 x i32> [[Y]], 0
-; CHECK-NEXT:    [[B:%.*]] = insertvalue [[STRUCT_TEST]] [[A]], <vscale x 1 x i32> [[Z]], 1
-; CHECK-NEXT:    store [[STRUCT_TEST]] [[B]], ptr [[X]], align 4
+; CHECK-NEXT:    store <vscale x 1 x i32> [[Y]], ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
+; CHECK-NEXT:    [[X_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
+; CHECK-NEXT:    store <vscale x 1 x i32> [[Z]], ptr [[X_REPACK1]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %a = insertvalue %struct.test undef, <vscale x 1 x i32> %y, 0
@@ -33,8 +37,14 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @split_load(ptr %p) nounwind {
 ; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @split_load
 ; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[P]], align 16
-; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[R_ELT1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT:    [[R_UNPACK2:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT1]], align 16
+; CHECK-NEXT:    [[R3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[R_UNPACK2]], 1
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R3]]
 ;
 entry:
   %r = load {<vscale x 16 x i8>, <vscale x 16 x i8>}, ptr %p
@@ -58,7 +68,13 @@ define void @split_store({<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p) no
 ; CHECK-LABEL: define void @split_store
 ; CHECK-SAME: ({ <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], ptr [[P]], align 16
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -104,9 +120,21 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @check_nxv16i8_nxv4i32({<vscale
 ; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @check_nxv16i8_nxv4i32
 ; CHECK-SAME: ({ <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], ptr [[P]], align 16
-; CHECK-NEXT:    [[R:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[P]], align 16
-; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT:    [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
+; CHECK-NEXT:    [[R5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R5]]
 ;
 entry:
   store {<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p
@@ -119,9 +147,21 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @alloca_nxv16i8_nxv4i32({<vscale
 ; CHECK-SAME: ({ <vscale x 4 x i32>, <vscale x 4 x i32> } [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
-; CHECK-NEXT:    store { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], ptr [[P]], align 16
-; CHECK-NEXT:    [[R:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[P]], align 16
-; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R]]
+; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
+; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT:    [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
+; CHECK-NEXT:    [[R5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
+; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[R5]]
 ;
 entry:
   %p = alloca {<vscale x 4 x i32>, <vscale x 4 x i32>}
