Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit c7a8ddb

Browse files
committed
[IAI,LV] Avoid creating a scalar epilogue due to gaps in interleave-groups when
optimizing for size LV is careful to respect -Os and not to create a scalar epilog in all cases (runtime tests, trip-counts that require a remainder loop) except for peeling due to gaps in interleave-groups. This patch fixes that; -Os will now have us invalidate such interleave-groups and vectorize without an epilog. The patch also removes a related FIXME comment that is now obsolete, and was also inaccurate: "FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a smaller MaxVF that does not require a scalar epilog." (requiresScalarEpilog() has nothing to do with VF). Reviewers: Ayal, hsaito, dcaballe, fhahn Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D53420 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344883 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent e01c86d commit c7a8ddb

File tree

4 files changed

+166
-4
lines changed

4 files changed

+166
-4
lines changed

include/llvm/Analysis/VectorUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,23 @@ class InterleaveGroup {
308308
propagateMetadata(NewInst, VL);
309309
}
310310

311+
/// Returns true if this Group requires a scalar iteration to handle gaps.
312+
bool requiresScalarEpilogue() const {
313+
// If Group has no gaps, or has gaps but the last member exists, then a
314+
// scalar epilog is not needed for this group.
315+
if (getNumMembers() == getFactor() || getMember(getFactor() - 1))
316+
return false;
317+
318+
// We have a group with gaps. It therefore cannot be a group of stores,
319+
// and it can't be a reversed access, because such groups get invalidated.
320+
assert(!getMember(0)->mayWriteToMemory() &&
321+
"Group should have been invalidated");
322+
assert(!isReverse() && "Group should have been invalidated");
323+
324+
// This is a group of loads, with gaps, and without a last-member
325+
return true;
326+
}
327+
311328
private:
312329
unsigned Factor; // Interleave Factor.
313330
bool Reverse;
@@ -388,6 +405,11 @@ class InterleavedAccessInfo {
388405
/// out-of-bounds requires a scalar epilogue iteration for correctness.
389406
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
390407

408+
/// Invalidate groups that require a scalar epilogue (due to gaps). This can
409+
/// happen when we optimize for size and don't allow creating a scalar
410+
/// epilogue.
411+
void invalidateGroupsRequiringScalarEpilogue();
412+
391413
private:
392414
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
393415
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.

lib/Analysis/VectorUtils.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,3 +919,27 @@ void InterleavedAccessInfo::analyzeInterleaving(
919919
}
920920
}
921921
}
922+
923+
void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
924+
// If no group had triggered the requirement to create an epilogue loop,
925+
// there is nothing to do.
926+
if (!requiresScalarEpilogue())
927+
return;
928+
929+
// Avoid releasing a Group twice.
930+
SmallPtrSet<InterleaveGroup *, 4> DelSet;
931+
for (auto &I : InterleaveGroupMap) {
932+
InterleaveGroup *Group = I.second;
933+
if (Group->requiresScalarEpilogue())
934+
DelSet.insert(Group);
935+
}
936+
for (auto *Ptr : DelSet) {
937+
LLVM_DEBUG(
938+
dbgs()
939+
<< "LV: Invalidate candidate interleaved group due to gaps that "
940+
"require a scalar epilogue.\n");
941+
releaseGroup(Ptr);
942+
}
943+
944+
RequiresScalarEpilogue = false;
945+
}

lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4599,6 +4599,14 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
45994599
return None;
46004600
}
46014601

4602+
// Record that scalar epilogue is not allowed.
4603+
LLVM_DEBUG(dbgs() << "LV: Not inserting scalar epilogue for access with gaps "
4604+
"due to -Os/-Oz.\n");
4605+
4606+
// We don't create an epilogue when optimizing for size.
4607+
// Invalidate interleave groups that require an epilogue.
4608+
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4609+
46024610
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
46034611

46044612
if (TC > 0 && TC % MaxVF == 0) {
@@ -4610,8 +4618,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
46104618
// found modulo the vectorization factor is not zero, try to fold the tail
46114619
// by masking.
46124620
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4613-
// FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
4614-
// smaller MaxVF that does not require a scalar epilog.
46154621
if (Legal->canFoldTailByMasking()) {
46164622
FoldTailByMasking = true;
46174623
return MaxVF;

test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
2-
; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
1+
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
2+
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
33

44
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
55
target triple = "i386-unknown-linux-gnu"
@@ -9,9 +9,13 @@ target triple = "i386-unknown-linux-gnu"
99
; interleaved-group but rather as a scalarized accesses.
1010
; (For SKX, Gather is not supported by the compiler for chars, therefore
1111
; the only remaining alternative is to scalarize).
12+
; In this case a scalar epilogue is not needed.
13+
;
1214
; When masked-interleave-group is enabled we expect to find the proper mask
1315
; shuffling code, feeding the wide masked load for an interleave-group (with
1416
; a single member).
17+
; Since the last (second) member of the load-group is a gap, peeling is used,
18+
; so we also expect to find a scalar epilogue loop.
1519
;
1620
; void masked_strided1(const unsigned char* restrict p,
1721
; unsigned char* restrict q,
@@ -38,6 +42,8 @@ target triple = "i386-unknown-linux-gnu"
3842
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
3943
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
4044
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
45+
;DISABLED_MASKED_STRIDED-NOT: for.body:
46+
;DISABLED_MASKED_STRIDED: for.end:
4147

4248
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
4349
;ENABLED_MASKED_STRIDED: vector.body:
@@ -47,6 +53,7 @@ target triple = "i386-unknown-linux-gnu"
4753
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
4854
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
4955
;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
56+
;ENABLED_MASKED_STRIDED: for.body:
5057

5158
define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
5259
entry:
@@ -75,6 +82,109 @@ for.end:
7582
ret void
7683
}
7784

85+
; Exactly the same scenario except we are now optimizing for size, therefore
86+
; we check that no scalar epilogue is created. Since we can't create an epilog
87+
; the interleave-group is invalidated because it has gaps, so we end up
88+
; scalarizing.
89+
; (Before the fix that this test checks, we used to create an epilogue despite
90+
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
91+
; and we make sure that a scalar epilogue does not exist).
92+
93+
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
94+
;ENABLED_MASKED_STRIDED: vector.body:
95+
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
96+
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
97+
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
98+
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
99+
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
100+
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
101+
;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
102+
;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
103+
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
104+
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
105+
;ENABLED_MASKED_STRIDED-NOT: for.body:
106+
;ENABLED_MASKED_STRIDED: for.end:
107+
108+
define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
109+
entry:
110+
%conv = zext i8 %guard to i32
111+
br label %for.body
112+
113+
for.body:
114+
%ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
115+
%cmp1 = icmp ugt i32 %ix.09, %conv
116+
br i1 %cmp1, label %if.then, label %for.inc
117+
118+
if.then:
119+
%mul = shl nuw nsw i32 %ix.09, 1
120+
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
121+
%0 = load i8, i8* %arrayidx, align 1
122+
%arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
123+
store i8 %0, i8* %arrayidx3, align 1
124+
br label %for.inc
125+
126+
for.inc:
127+
%inc = add nuw nsw i32 %ix.09, 1
128+
%exitcond = icmp eq i32 %inc, 1024
129+
br i1 %exitcond, label %for.end, label %for.body
130+
131+
for.end:
132+
ret void
133+
}
134+
135+
; Same, but the load/store are not predicated. The interleave-group is
136+
; invalidated here as well because we have gaps and we can't create an epilog.
137+
; The access is thus scalarized.
138+
; (Before the fix that this test checks, we used to create an epilogue despite
139+
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
140+
; and we make sure that a scalar epilogue does not exist).
141+
; Since enable-masked-interleaved-accesses currently only affects predicated
142+
; accesses, the behavior is the same with this switch set/unset.
143+
144+
145+
; void unconditional_strided1_optsize(const unsigned char* restrict p,
146+
; unsigned char* restrict q,
147+
; unsigned char guard) {
148+
; for(ix=0; ix < 1024; ++ix) {
149+
; char t = p[2*ix];
150+
; q[ix] = t;
151+
; }
152+
; }
153+
154+
;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
155+
;DISABLED_MASKED_STRIDED: vector.body:
156+
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
157+
;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
158+
;DISABLED_MASKED_STRIDED-NOT: for.body:
159+
;DISABLED_MASKED_STRIDED: for.end:
160+
161+
;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
162+
;ENABLED_MASKED_STRIDED: vector.body:
163+
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
164+
;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
165+
;ENABLED_MASKED_STRIDED-NOT: for.body:
166+
;ENABLED_MASKED_STRIDED: for.end:
167+
168+
define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
169+
entry:
170+
br label %for.body
171+
172+
for.body:
173+
%ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
174+
%mul = shl nuw nsw i32 %ix.06, 1
175+
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
176+
%0 = load i8, i8* %arrayidx, align 1
177+
%arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
178+
store i8 %0, i8* %arrayidx1, align 1
179+
%inc = add nuw nsw i32 %ix.06, 1
180+
%exitcond = icmp eq i32 %inc, 1024
181+
br i1 %exitcond, label %for.end, label %for.body
182+
183+
for.end:
184+
ret void
185+
}
186+
187+
78188
; Check also a scenario with full interleave-groups (no gaps) as well as both
79189
; load and store groups. We check that when masked-interleave-group is disabled
80190
; the predicated loads (and stores) are not vectorized as an

0 commit comments

Comments
 (0)