[LV] Disable fold tail by masking - when induction vars used outside #81609

niwinanto · 2024-02-13T14:45:35Z

When induction variables are used outside the loop body, tail folding
by masking miscompiles, because for users outside of the loop the
final value of the induction is computed separately from the vector
loop.

Fixes #76069
Fixes #51677

github-actions · 2024-02-13T14:45:52Z

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be
notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write
permissions for the repository. In which case you can instead tag reviewers by
name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review
by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate
is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

llvmbot · 2024-02-13T14:46:23Z

@llvm/pr-subscribers-llvm-transforms

Author: Niwin Anto (niwinanto)

Changes

When induction variables are used outside the loop body, tail folding by masking miscompiles.
#76069

Full diff: https://github.com/llvm/llvm-project/pull/81609.diff

2 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+13)
(added) llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll (+85)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 37a356c43e29a4..d33743e74cbe31 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1552,6 +1552,19 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
     }
   }
 
+  for (const auto &Entry : getInductionVars()) {
+    PHINode *OrigPhi = Entry.first;
+    for (User *U : OrigPhi->users()) {
+      auto *UI = cast<Instruction>(U);
+      if (!TheLoop->contains(UI)) {
+        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
+                             "outside user for "
+                          << *UI << "\n");
+        return false;
+      }
+    }
+  }
+
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet<Value *, 8> SafePointers;
 
diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
new file mode 100644
index 00000000000000..f7379df934bd77
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+
+
+; #include <stdio.h>
+; #define SIZE 17
+;
+; unsigned char result;
+; unsigned char arr_1[SIZE];
+;
+; __attribute__((__noinline__))
+; void test(int limit, unsigned char val, int arr_2[SIZE][SIZE][SIZE]) {
+;     #pragma clang loop vectorize_predicate(enable)
+;     for (short i_5 = 0; i_5 < limit; i_5++) {
+;         arr_1 [i_5] = val;
+;         result = arr_2[0][0][i_5] != arr_2[i_5][i_5][0];
+;     }
+; }
+;
+;int main(void) {
+;  int arr_2[SIZE][SIZE][SIZE];
+;
+;  __builtin_memset(arr_2, 1, sizeof(arr_2));
+;
+;  test(SIZE, 0, arr_2);
+;  printf("%hu \n", result);
+;}
+; clang miss-compiles the above code
+; with vectorize_predicate(enable), result is 0 and 1 without.
+
+
+@result = global i8 0, align 1
+@arr_17 = global [17 x i8] zeroinitializer, align 1
+@a = external global i8, align 1
+
+define void @test(i32 %limit, i8 zeroext %val, ptr readonly %arr_14)   {
+; CHECK-LABEL: @test(
+; CHECK-NOT:       pred.store.if:
+; CHECK-NOT:       pred.store.continue:
+;
+entry:
+  %cmp18 = icmp sgt i32 %limit, 0
+  br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %conv20.lcssa = phi i32 [ %conv20, %for.body ]
+  %arrayidx4 = getelementptr inbounds [17 x i32], ptr %arr_14, i32 0, i32 %conv20.lcssa
+  %0 = load i32, ptr %arrayidx4, align 4, !tbaa !4
+  %arrayidx8 = getelementptr inbounds [17 x [17 x i32]], ptr %arr_14, i32 %conv20.lcssa, i32 %conv20.lcssa
+  %1 = load i32, ptr %arrayidx8, align 4, !tbaa !4
+  %cmp10 = icmp ne i32 %0, %1
+  %conv11 = zext i1 %cmp10 to i8
+  store i8 %conv11, ptr @result, align 1, !tbaa !8
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %conv20 = phi i32 [ %conv, %for.body ], [ 0, %for.body.preheader ]
+  %i_5.019 = phi i16 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [17 x i8], ptr @arr_17, i32 0, i32 %conv20
+  store i8 %val, ptr %arrayidx, align 1, !tbaa !8
+  %inc = add i16 %i_5.019, 1
+  %conv = sext i16 %inc to i32
+  %cmp = icmp slt i32 %conv, %limit
+  br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge, !llvm.loop !9
+}
+
+
+
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!6, !6, i64 0}
+!9 = distinct !{!9, !10, !11, !12, !13, !14}
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!12 = !{!"llvm.loop.vectorize.width", i32 2}
+!13 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!14 = !{!"llvm.loop.vectorize.enable", i1 true}

fhahn

Thanks for the patch!

Could you add the test as a separate PR (with a FIXME)? This patch would then just adjust the test, and the diff would show only the change in the test.

Previously there was a patch shared here https://reviews.llvm.org/D115109 by @rickyz (hope it's the same as on Phabricator), but the patch never got pushed through. It would be good to look at the comments there and potentially pick it up.

fhahn · 2024-02-13T17:51:52Z