Skip to content

Commit 87d7757

Browse files
committed
[SLP] Control maximum vectorization factor from TTI
D82227 added a proper check to limit PHI vectorization to the maximum vector register size. Unfortunately, that resulted in at least a couple of regressions on SystemZ and x86. This change reverts the PHI handling from D82227 and replaces it with a more general check in SLPVectorizerPass::tryToVectorizeList(). Moving the check to tryToVectorizeList() allows vectorization to be restarted if the initial chunk fails. However, this function is more general and handles not only PHIs but everything that SLP handles. If the vectorization factor were limited to the maximum vector register size, it would limit much more vectorization than before, leading to further regressions. Therefore a new TTI callback, getMaximumVF(), is added with a default of 0 to preserve the current behavior and limit nothing. Targets can then decide what is better for them. The callback receives the element size, just like the similar getMinimumVF() function, and the main opcode of the chain. The latter is needed to avoid regressions at least on AMDGPU: there we can have loads and stores up to 128 bits wide, and <2 x 16>-bit vector math on some subtargets, while the rest shall not be vectorized. I.e., we need to differentiate based on both the element size and the operation itself. Differential Revision: https://reviews.llvm.org/D92059
1 parent c21df2a commit 87d7757

File tree

9 files changed

+95
-37
lines changed

9 files changed

+95
-37
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -941,6 +941,11 @@ class TargetTransformInfo {
941941
/// applies when shouldMaximizeVectorBandwidth returns true.
942942
unsigned getMinimumVF(unsigned ElemWidth) const;
943943

944+
/// \return The maximum vectorization factor for types of given element
945+
/// bit width and opcode, or 0 if there is no maximum VF.
946+
/// Currently only used by the SLP vectorizer.
947+
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
948+
944949
/// \return True if it should be considered for address type promotion.
945950
/// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
946951
/// profitable without finding other extensions fed by the same input.
@@ -1498,6 +1503,7 @@ class TargetTransformInfo::Concept {
14981503
virtual unsigned getMinVectorRegisterBitWidth() = 0;
14991504
virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
15001505
virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
1506+
virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
15011507
virtual bool shouldConsiderAddressTypePromotion(
15021508
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
15031509
virtual unsigned getCacheLineSize() const = 0;
@@ -1917,6 +1923,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
19171923
unsigned getMinimumVF(unsigned ElemWidth) const override {
19181924
return Impl.getMinimumVF(ElemWidth);
19191925
}
1926+
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
1927+
return Impl.getMaximumVF(ElemWidth, Opcode);
1928+
}
19201929
bool shouldConsiderAddressTypePromotion(
19211930
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
19221931
return Impl.shouldConsiderAddressTypePromotion(

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,8 @@ class TargetTransformInfoImplBase {
356356

357357
unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
358358

359+
/// Default implementation: returns 0 for every element width and opcode,
/// which the TargetTransformInfo interface defines as "no maximum VF".
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; }
360+
359361
bool
360362
shouldConsiderAddressTypePromotion(const Instruction &I,
361363
bool &AllowPromotionWithoutCommonHeader) {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,11 @@ unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {
635635
return TTIImpl->getMinimumVF(ElemWidth);
636636
}
637637

638+
// Forwards the maximum-VF query to the target-specific implementation held in
// TTIImpl, passing the element bit width and opcode through unchanged.
unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth,
639+
unsigned Opcode) const {
640+
return TTIImpl->getMaximumVF(ElemWidth, Opcode);
641+
}
642+
638643
bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
639644
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
640645
return TTIImpl->shouldConsiderAddressTypePromotion(

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
288288
return 32;
289289
}
290290

291+
// Returns the maximum vectorization factor for the given element bit width
// and opcode on this target.
//  - Loads and stores: as many elements as fit in 32 * 4 = 128 bits.
//  - Any other opcode: 2 when the elements are 16 bits wide and the subtarget
//    reports 16-bit instructions, otherwise 1.
// NOTE(review): the load/store division is integer division, so it yields 0
// for ElemWidth > 128, and a result of 0 is documented on the TTI interface as
// "no maximum VF"; ElemWidth == 0 would divide by zero. Confirm that callers
// never pass such widths.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
292+
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
293+
return 32 * 4 / ElemWidth;
294+
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
295+
}
296+
291297
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
292298
unsigned ChainSizeInBytes,
293299
VectorType *VecTy) const {

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
170170
unsigned getNumberOfRegisters(unsigned RCID) const;
171171
unsigned getRegisterBitWidth(bool Vector) const;
172172
unsigned getMinVectorRegisterBitWidth() const;
173+
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
173174
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
174175
unsigned ChainSizeInBytes,
175176
VectorType *VecTy) const;

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ static cl::opt<int>
126126
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
127127
cl::desc("Attempt to vectorize for this register size in bits"));
128128

129+
// Hidden command-line option "slp-max-vf". It defaults to 0, which the
// description below defines as unlimited.
static cl::opt<unsigned>
130+
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
131+
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
132+
129133
static cl::opt<int>
130134
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
131135
cl::desc("Maximum depth of the lookup for consecutive stores."));
@@ -741,6 +745,12 @@ class BoUpSLP {
741745
return MinVecRegSize;
742746
}
743747

748+
/// \returns the maximum vectorization factor to use for elements of
/// \p ElemWidth bits and the operation \p Opcode.
/// If the slp-max-vf option occurred on the command line, its value takes
/// precedence; otherwise the target is queried through TTI. A value of 0 from
/// either source is mapped to UINT_MAX, i.e. no limit.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
749+
unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
750+
MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
751+
return MaxVF ? MaxVF : UINT_MAX;
752+
}
753+
744754
/// Check if homogeneous aggregate is isomorphic to some VectorType.
745755
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
746756
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
@@ -6191,6 +6201,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
61916201
unsigned Sz = R.getVectorElementSize(I0);
61926202
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
61936203
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
6204+
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
61946205
if (MaxVF < 2) {
61956206
R.getORE()->emit([&]() {
61966207
return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
@@ -7633,7 +7644,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
76337644
bool Changed = false;
76347645
SmallVector<Value *, 4> Incoming;
76357646
SmallPtrSet<Value *, 16> VisitedInstrs;
7636-
unsigned MaxVecRegSize = R.getMaxVecRegSize();
76377647

76387648
bool HaveVectorizedPhiNodes = true;
76397649
while (HaveVectorizedPhiNodes) {
@@ -7660,27 +7670,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
76607670

76617671
// Look for the next elements with the same type.
76627672
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
7663-
Type *EltTy = (*IncIt)->getType();
7664-
7665-
assert(EltTy->isSized() &&
7666-
"Instructions should all be sized at this point");
7667-
TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
7668-
if (EltTS.isScalable()) {
7669-
// For now, just ignore vectorizing scalable types.
7670-
++IncIt;
7671-
continue;
7672-
}
7673-
7674-
unsigned EltSize = EltTS.getFixedSize();
7675-
unsigned MaxNumElts = MaxVecRegSize / EltSize;
7676-
if (MaxNumElts < 2) {
7677-
++IncIt;
7678-
continue;
7679-
}
7680-
76817673
while (SameTypeIt != E &&
7682-
(*SameTypeIt)->getType() == EltTy &&
7683-
static_cast<unsigned>(SameTypeIt - IncIt) < MaxNumElts) {
7674+
(*SameTypeIt)->getType() == (*IncIt)->getType()) {
76847675
VisitedInstrs.insert(*SameTypeIt);
76857676
++SameTypeIt;
76867677
}

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,18 @@ bb:
123123
ret <2 x i16> %ins.1
124124
}
125125

126-
; FIXME: Should not vectorize
127126
define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
128127
; GCN-LABEL: @uadd_sat_v2i32(
129128
; GCN-NEXT: bb:
130-
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
131-
; GCN-NEXT: ret <2 x i32> [[TMP0]]
129+
; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
130+
; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
131+
; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
132+
; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
133+
; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
134+
; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
135+
; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
136+
; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
137+
; GCN-NEXT: ret <2 x i32> [[INS_1]]
132138
;
133139
bb:
134140
%arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -145,8 +151,15 @@ bb:
145151
define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
146152
; GCN-LABEL: @usub_sat_v2i32(
147153
; GCN-NEXT: bb:
148-
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
149-
; GCN-NEXT: ret <2 x i32> [[TMP0]]
154+
; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
155+
; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
156+
; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
157+
; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
158+
; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
159+
; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
160+
; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
161+
; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
162+
; GCN-NEXT: ret <2 x i32> [[INS_1]]
150163
;
151164
bb:
152165
%arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -163,8 +176,15 @@ bb:
163176
define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
164177
; GCN-LABEL: @sadd_sat_v2i32(
165178
; GCN-NEXT: bb:
166-
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
167-
; GCN-NEXT: ret <2 x i32> [[TMP0]]
179+
; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
180+
; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
181+
; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
182+
; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
183+
; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
184+
; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
185+
; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
186+
; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
187+
; GCN-NEXT: ret <2 x i32> [[INS_1]]
168188
;
169189
bb:
170190
%arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -181,8 +201,15 @@ bb:
181201
define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
182202
; GCN-LABEL: @ssub_sat_v2i32(
183203
; GCN-NEXT: bb:
184-
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
185-
; GCN-NEXT: ret <2 x i32> [[TMP0]]
204+
; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
205+
; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
206+
; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
207+
; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
208+
; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
209+
; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
210+
; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
211+
; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
212+
; GCN-NEXT: ret <2 x i32> [[INS_1]]
186213
;
187214
bb:
188215
%arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -267,8 +294,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
267294
;
268295
; GFX8-LABEL: @uadd_sat_v4i16(
269296
; GFX8-NEXT: bb:
270-
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
271-
; GFX8-NEXT: ret <4 x i16> [[TMP0]]
297+
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
298+
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
299+
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
300+
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
301+
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
302+
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
303+
; GFX8-NEXT: [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
304+
; GFX8-NEXT: ret <4 x i16> [[INS_3]]
272305
;
273306
bb:
274307
%arg0.0 = extractelement <4 x i16> %arg0, i64 0

llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ bb:
1818
ret <2 x half> %tmp5
1919
}
2020

21-
; TODO: Should probably not really be vectorizing this
2221
; GCN-LABEL: @round_v2f32(
23-
; GCN: call <2 x float> @llvm.round.v2f32
22+
; GCN: call float @llvm.round.f32(
23+
; GCN: call float @llvm.round.f32(
2424
define <2 x float> @round_v2f32(<2 x float> %arg) {
2525
bb:
2626
%tmp = extractelement <2 x float> %arg, i64 0

llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s
3-
; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s
4-
; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s
2+
; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s
3+
; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s
4+
; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s
5+
; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s
6+
7+
; Make sure we do not vectorize to create PHI wider than requested.
8+
; On AMDGPU target wider vectorization will result in a higher register pressure,
9+
; spilling, or even inability to allocate registers.
510

611
define void @phi_float32(half %hval, float %fval) {
712
; MAX32-LABEL: @phi_float32(
@@ -120,6 +125,7 @@ define void @phi_float32(half %hval, float %fval) {
120125
; MAX32-NEXT: [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
121126
; MAX32-NEXT: [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ]
122127
; MAX32-NEXT: [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ]
128+
; MAX32-NEXT: store float [[PHI31]], float* undef, align 4
123129
; MAX32-NEXT: ret void
124130
;
125131
; MAX256-LABEL: @phi_float32(
@@ -296,6 +302,8 @@ define void @phi_float32(half %hval, float %fval) {
296302
; MAX256-NEXT: [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]
297303
; MAX256-NEXT: [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]
298304
; MAX256-NEXT: [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]
305+
; MAX256-NEXT: [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6
306+
; MAX256-NEXT: store float [[TMP157]], float* undef, align 4
299307
; MAX256-NEXT: ret void
300308
;
301309
; MAX1024-LABEL: @phi_float32(
@@ -481,6 +489,8 @@ define void @phi_float32(half %hval, float %fval) {
481489
; MAX1024-NEXT: br label [[BB2]]
482490
; MAX1024: bb2:
483491
; MAX1024-NEXT: [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]
492+
; MAX1024-NEXT: [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30
493+
; MAX1024-NEXT: store float [[TMP166]], float* undef, align 4
484494
; MAX1024-NEXT: ret void
485495
;
486496
bb:
@@ -603,5 +613,6 @@ bb2:
603613
%phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]
604614
%phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ]
605615
%phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ]
616+
store float %phi31, float* undef
606617
ret void
607618
}

0 commit comments

Comments
 (0)