[SLP] Control maximum vectorization factor from TTI

rampitec · rampitec · commit 87d7757bbe14 · 2020-12-14T08:49:40.000-08:00
D82227 has added a proper check to limit PHI vectorization to the maximum vector register size. That unfortunately resulted in at least a couple of regressions on SystemZ and x86. This change reverts PHI handling from D82227 and replaces it with a more general check in SLPVectorizerPass::tryToVectorizeList(). Moved to tryToVectorizeList() it allows to restart vectorization if initial chunk fails. However, this function is more general and handles not only PHI but everything which SLP handles. If vectorization factor would be limited to maximum vector register size it would limit much more vectorization than before leading to further regressions. Therefore a new TTI callback getMaximumVF() is added with the default 0 to preserve current behavior and limit nothing. Then targets can decide what is better for them. The callback gets ElementSize just like a similar getMinimumVF() function and the main opcode of the chain. The latter is to avoid regressions at least on the AMDGPU. We can have loads and stores up to 128 bit wide, and <2 x 16> bit vector math on some subtargets, where the rest shall not be vectorized. I.e. we need to differentiate based on the element size and operation itself. Differential Revision: https://reviews.llvm.org/D92059
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -941,6 +941,11 @@ class TargetTransformInfo {
   /// applies when shouldMaximizeVectorBandwidth returns true.
   unsigned getMinimumVF(unsigned ElemWidth) const;
 
+  /// \return The maximum vectorization factor for types of given element
+  /// bit width and opcode, or 0 if there is no maximum VF.
+  /// Currently only used by the SLP vectorizer.
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -1498,6 +1503,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
+  virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() const = 0;
@@ -1917,6 +1923,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMinimumVF(unsigned ElemWidth) const override {
     return Impl.getMinimumVF(ElemWidth);
   }
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
+    return Impl.getMaximumVF(ElemWidth, Opcode);
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -356,6 +356,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
 
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; }
+
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
                                      bool &AllowPromotionWithoutCommonHeader) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -635,6 +635,11 @@ unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {
   return TTIImpl->getMinimumVF(ElemWidth);
 }
 
+unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth,
+                                           unsigned Opcode) const {
+  return TTIImpl->getMaximumVF(ElemWidth, Opcode);
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
   return 32;
 }
 
+unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
+    return 32 * 4 / ElemWidth;
+  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+}
+
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -170,6 +170,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   unsigned getNumberOfRegisters(unsigned RCID) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -126,6 +126,10 @@ static cl::opt<int>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
 
+static cl::opt<unsigned>
+MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+
 static cl::opt<int>
 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
     cl::desc("Maximum depth of the lookup for consecutive stores."));
@@ -741,6 +745,12 @@ class BoUpSLP {
     return MinVecRegSize;
   }
 
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
+      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+    return MaxVF ? MaxVF : UINT_MAX;
+  }
+
   /// Check if homogeneous aggregate is isomorphic to some VectorType.
   /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
   /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
@@ -6191,6 +6201,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   if (MaxVF < 2) {
     R.getORE()->emit([&]() {
       return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
@@ -7633,7 +7644,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
   SmallPtrSet<Value *, 16> VisitedInstrs;
-  unsigned MaxVecRegSize = R.getMaxVecRegSize();
 
   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
@@ -7660,27 +7670,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
       // Look for the next elements with the same type.
       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
-      Type *EltTy = (*IncIt)->getType();
-
-      assert(EltTy->isSized() &&
-             "Instructions should all be sized at this point");
-      TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
-      if (EltTS.isScalable()) {
-        // For now, just ignore vectorizing scalable types.
-        ++IncIt;
-        continue;
-      }
-
-      unsigned EltSize = EltTS.getFixedSize();
-      unsigned MaxNumElts = MaxVecRegSize / EltSize;
-      if (MaxNumElts < 2) {
-        ++IncIt;
-        continue;
-      }
-
       while (SameTypeIt != E &&
-             (*SameTypeIt)->getType() == EltTy &&
-             static_cast<unsigned>(SameTypeIt - IncIt) < MaxNumElts) {
+             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
         VisitedInstrs.insert(*SameTypeIt);
         ++SameTypeIt;
       }
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -123,12 +123,18 @@ bb:
   ret <2 x i16> %ins.1
 }
 
-; FIXME: Should not vectorize
 define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @uadd_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -145,8 +151,15 @@ bb:
 define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @usub_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -163,8 +176,15 @@ bb:
 define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @sadd_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -181,8 +201,15 @@ bb:
 define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @ssub_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -267,8 +294,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
-; GFX8-NEXT:    ret <4 x i16> [[TMP0]]
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
@@ -18,9 +18,9 @@ bb:
   ret <2 x half> %tmp5
 }
 
-; TODO: Should probably not really be vectorizing this
 ; GCN-LABEL: @round_v2f32(
-; GCN: call <2 x float> @llvm.round.v2f32
+; GCN: call float @llvm.round.f32(
+; GCN: call float @llvm.round.f32(
 define <2 x float> @round_v2f32(<2 x float> %arg) {
 bb:
   %tmp = extractelement <2 x float> %arg, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -1,7 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s
+; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s
+
+; Make sure we do not vectorize to create PHI wider than requested.
+; On AMDGPU target wider vectorization will result in a higher register pressure,
+; spilling, or even inability to allocate registers.
 
 define void @phi_float32(half %hval, float %fval) {
 ; MAX32-LABEL: @phi_float32(
@@ -120,6 +125,7 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX32-NEXT:    [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
 ; MAX32-NEXT:    [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ]
 ; MAX32-NEXT:    [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ]
+; MAX32-NEXT:    store float [[PHI31]], float* undef, align 4
 ; MAX32-NEXT:    ret void
 ;
 ; MAX256-LABEL: @phi_float32(
@@ -296,6 +302,8 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256-NEXT:    [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6
+; MAX256-NEXT:    store float [[TMP157]], float* undef, align 4
 ; MAX256-NEXT:    ret void
 ;
 ; MAX1024-LABEL: @phi_float32(
@@ -481,6 +489,8 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb2:
 ; MAX1024-NEXT:    [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30
+; MAX1024-NEXT:    store float [[TMP166]], float* undef, align 4
 ; MAX1024-NEXT:    ret void
 ;
 bb:
@@ -603,5 +613,6 @@ bb2:
   %phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]
   %phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ]
   %phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ]
+  store float %phi31, float* undef
   ret void
 }