[LoopVectorize] Add test case for minloc reduction #141556

madhur13490 · 2025-05-27T07:10:07Z

This patch adds a test case extracted from Polyhedron benchmark. https://fortran.uk/fortran-compiler-comparisons/the-polyhedron-solutions-benchmark-suite/

The test is specifically interesting for vectorizing min/max reduction pattern.

The actual FORTRAN code is

integer function minlst(ipos1, ipos2)
       integer, intent(in) :: ipos1, ipos2
       integer :: ipos
       minlst = ipos2
       do ipos = ipos2 - 1, ipos1, -1
           if (xxtrt(ipos) < xxtrt(minlst)) then
               minlst = ipos
           end if
       end do
   end function minlst

Patch to recognize and transform the pattern in LoopIdiomVectorize can be found at #144987

llvmbot · 2025-05-27T07:10:41Z

@llvm/pr-subscribers-llvm-transforms

Author: Madhur Amilkanthwar (madhur13490)

Changes

This patch adds a test case extracted from Polyhedron benchmark. https://fortran.uk/fortran-compiler-comparisons/the-polyhedron-solutions-benchmark-suite/

The test is specifically interesting for vectorizing min/max reduction pattern.

Full diff: https://github.com/llvm/llvm-project/pull/141556.diff

1 Files Affected:

(added) llvm/test/Transforms/LoopVectorize/last-min-index.ll (+84)

diff --git a/llvm/test/Transforms/LoopVectorize/last-min-index.ll b/llvm/test/Transforms/LoopVectorize/last-min-index.ll
new file mode 100644
index 0000000000000..f69145eafa74a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/last-min-index.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN
+
+; This test case is extracted from rnflow (fortran) benchmark in polyhedron benchmark suite.
+; The function minlst primarily takes two indices (i.e. range), scans backwards in the range
+; and returns the firstIV of the minimum value.
+
+define fastcc i32 @minlst(i32 %.0.val, i32 %.0.val1, ptr %.0.val3) {
+; CHECK-REV-MIN-LABEL: define internal fastcc i32 @_QFcptrf2Pminlst(
+; CHECK-REV-MIN-SAME: i32 [[DOT0_VAL:%.*]], i32 [[DOT0_VAL1:%.*]], ptr [[DOT0_VAL3:%.*]]) unnamed_addr {
+; CHECK-REV-MIN-NEXT:    [[TMP1:%.*]] = sext i32 [[DOT0_VAL]] to i64
+; CHECK-REV-MIN-NEXT:    [[TMP2:%.*]] = sub i32 0, [[DOT0_VAL1]]
+; CHECK-REV-MIN-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; CHECK-REV-MIN-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP1]], [[TMP3]]
+; CHECK-REV-MIN-NEXT:    [[TMP5:%.*]] = sub nsw i64 0, [[TMP4]]
+; CHECK-REV-MIN-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[DOT0_VAL3]], i64 -8
+; CHECK-REV-MIN-NEXT:    [[INVARIANT_GEP5:%.*]] = getelementptr i8, ptr [[DOT0_VAL3]], i64 -4
+; CHECK-REV-MIN-NEXT:    [[TMP6:%.*]] = icmp slt i64 [[TMP4]], 0
+; CHECK-REV-MIN-NEXT:    br i1 [[TMP6]], label %[[DOTLR_PH_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN:       [[_LR_PH_PREHEADER:.*:]]
+; CHECK-REV-MIN-NEXT:    [[TMP7:%.*]] = sext i32 [[DOT0_VAL1]] to i64
+; CHECK-REV-MIN-NEXT:    br label %[[DOTLR_PH:.*]]
+; CHECK-REV-MIN:       [[_LR_PH:.*:]]
+; CHECK-REV-MIN-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP7]], %[[DOTLR_PH_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ]
+; CHECK-REV-MIN-NEXT:    [[TMP8:%.*]] = phi i64 [ [[TMP14:%.*]], %[[DOTLR_PH]] ], [ [[TMP5]], %[[DOTLR_PH_PREHEADER]] ]
+; CHECK-REV-MIN-NEXT:    [[DOT07:%.*]] = phi i32 [ [[DOT1:%.*]], %[[DOTLR_PH]] ], [ [[DOT0_VAL1]], %[[DOTLR_PH_PREHEADER]] ]
+; CHECK-REV-MIN-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-REV-MIN-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV]]
+; CHECK-REV-MIN-NEXT:    [[TMP9:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-REV-MIN-NEXT:    [[TMP10:%.*]] = sext i32 [[DOT07]] to i64
+; CHECK-REV-MIN-NEXT:    [[GEP6:%.*]] = getelementptr float, ptr [[INVARIANT_GEP5]], i64 [[TMP10]]
+; CHECK-REV-MIN-NEXT:    [[TMP11:%.*]] = load float, ptr [[GEP6]], align 4
+; CHECK-REV-MIN-NEXT:    [[TMP12:%.*]] = fcmp contract olt float [[TMP9]], [[TMP11]]
+; CHECK-REV-MIN-NEXT:    [[TMP13:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-REV-MIN-NEXT:    [[DOT1]] = select i1 [[TMP12]], i32 [[TMP13]], i32 [[DOT07]]
+; CHECK-REV-MIN-NEXT:    [[TMP14]] = add nsw i64 [[TMP8]], -1
+; CHECK-REV-MIN-NEXT:    [[TMP15:%.*]] = icmp sgt i64 [[TMP8]], 1
+; CHECK-REV-MIN-NEXT:    br i1 [[TMP15]], label %[[DOTLR_PH]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN:       [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-NEXT:    [[DOT1_LCSSA:%.*]] = phi i32 [ [[DOT1]], %[[DOTLR_PH]] ]
+; CHECK-REV-MIN-NEXT:    br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN:       [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-NEXT:    [[DOT0_LCSSA:%.*]] = phi i32 [ [[DOT0_VAL1]], [[TMP0:%.*]] ], [ [[DOT1_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-NEXT:    ret i32 [[DOT0_LCSSA]]
+;
+  %1 = sext i32 %.0.val to i64
+  %2 = sub i32 0, %.0.val1
+  %3 = sext i32 %2 to i64
+  %4 = add nsw i64 %1, %3
+  %5 = sub nsw i64 0, %4
+  %invariant.gep = getelementptr i8, ptr %.0.val3, i64 -8
+  %invariant.gep5 = getelementptr i8, ptr %.0.val3, i64 -4
+  %6 = icmp slt i64 %4, 0
+  br i1 %6, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %0
+  %7 = sext i32 %.0.val1 to i64
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %7, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
+  %8 = phi i64 [ %14, %.lr.ph ], [ %5, %.lr.ph.preheader ]
+  %.07 = phi i32 [ %.1, %.lr.ph ], [ %.0.val1, %.lr.ph.preheader ]
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  %9 = load float, ptr %gep, align 4
+  %10 = sext i32 %.07 to i64
+  %gep6 = getelementptr float, ptr %invariant.gep5, i64 %10
+  %11 = load float, ptr %gep6, align 4
+  %12 = fcmp contract olt float %9, %11
+  %13 = trunc nsw i64 %indvars.iv.next to i32
+  %.1 = select i1 %12, i32 %13, i32 %.07
+  %14 = add nsw i64 %8, -1
+  %15 = icmp sgt i64 %8, 1
+  br i1 %15, label %.lr.ph, label %._crit_edge.loopexit
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  %.1.lcssa = phi i32 [ %.1, %.lr.ph ]
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %0
+  %.0.lcssa = phi i32 [ %.0.val1, %0 ], [ %.1.lcssa, %._crit_edge.loopexit ]
+  ret i32 %.0.lcssa
+}

madhur13490 · 2025-05-27T07:12:06Z

Related PRs: #141431 and #141467.

fhahn

Thamks for adding the test. Left some initial comments

fhahn · 2025-05-27T10:07:42Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN


This will need something like -force-vector-width=4 to make sure this gets vectorized if possible.

Suggested change

; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN

; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s

fhahn · 2025-05-27T10:08:04Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %1 = sext i32 %.0.val to i64
+  %2 = sub i32 0, %.0.val1
+  %3 = sext i32 %2 to i64
+  %4 = add nsw i64 %1, %3
+  %5 = sub nsw i64 0, %4
+  %invariant.gep = getelementptr i8, ptr %.0.val3, i64 -8
+  %invariant.gep5 = getelementptr i8, ptr %.0.val3, i64 -4
+  %6 = icmp slt i64 %4, 0
+  br i1 %6, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %0
+  %7 = sext i32 %.0.val1 to i64
+  br label %.lr.ph


I think all this could be simplified?

fhahn · 2025-05-27T10:08:16Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %7 = sext i32 %.0.val1 to i64
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph


Suggested change

.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph

loop:

fhahn · 2025-05-27T10:08:25Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %7, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]


Suggested change

%indvars.iv = phi i64 [ %7, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]

%iv = phi i64 [ %7, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]

fhahn · 2025-05-27T10:09:26Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %.07 = phi i32 [ %.1, %.lr.ph ], [ %.0.val1, %.lr.ph.preheader ]
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  %9 = load float, ptr %gep, align 4
+  %10 = sext i32 %.07 to i64
+  %gep6 = getelementptr float, ptr %invariant.gep5, i64 %10
+  %11 = load float, ptr %gep6, align 4
+  %12 = fcmp contract olt float %9, %11
+  %13 = trunc nsw i64 %indvars.iv.next to i32
+  %.1 = select i1 %12, i32 %13, i32 %.07
+  %14 = add nsw i64 %8, -1
+  %15 = icmp sgt i64 %8, 1


would be good to use more descriptive names for some of the values, to help read-ability

madhur13490 · 2025-05-27T12:32:40Z

@fhahn Thanks for the review. I have addressed comments, I hope that the latest revision covers all!

madhur13490 · 2025-05-28T11:24:27Z

@fhahn or @Mel-Chen Would any of your pending patches cause vectorization of this test case? (I am having LoopIdiomVectorize based patch ready, and I should be upstreaming that soon, but wanted to check once.)

Mel-Chen · 2025-05-29T07:26:08Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN


Test with more configurations.

-force-vector-width=4 -force-vector-interleave=1

-force-vector-width=4 -force-vector-interleave=2

-force-vector-width=1 -force-vector-interleave=4

Mel-Chen · 2025-05-29T07:29:14Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+; The function minlst primarily takes two indices (i.e. range), scans backwards in the range
+; and returns the firstIV of the minimum value.
+
+define fastcc i32 @minlst(i32 %first_index, i32 %last_index, ptr %array) {


Do we really need fastcc?

fastcc is fast calling convention—how does it help vectorization?

Mel-Chen · 2025-05-29T07:32:29Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %iv = phi i64 [ %last_index_sext, %loop.preheader ], [ %iv.next, %loop ]
+  %dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ]
+  %index = phi i32 [ %select, %loop ], [ %last_index, %loop.preheader ]


Align the format of phi nodes, either phi loop.preheader, loop or phi loop, loop.preheader.

Mel-Chen · 2025-05-29T07:34:45Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+._crit_edge.loopexit:                             ; preds = %loop
+  %select.lcssa = phi i32 [ %select, %loop ]
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %0
+  %last_index_ret = phi i32 [ %last_index, %entry ], [ %select.lcssa, %._crit_edge.loopexit ]
+  ret i32 %last_index_ret
+}


Merge ._crit_edge.loopexit and ._crit_edge into one basic block.

Mel-Chen · 2025-05-29T07:42:27Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %first_ptr = getelementptr i8, ptr %array, i64 -8
+  %second_ptr = getelementptr i8, ptr %array, i64 -4


Could %first_ptr and %second_ptr be passed as arguments directly?

No, I'd like to preserve the structure.

Why? Would passing them directly prevent this case from being vectorized?

This is the case of no vectorization. Passing %first_ptr as arguments should not cause vectorization, but as I said, the test is extracted from the pass pipeline and particularly from a pass before LoopIdiomVectorize. Passing them as parameters would not be what it is produced by the pass.

The actual FORTRAN code is

nteger function minlst(ipos1, ipos2) integer, intent(in) :: ipos1, ipos2 integer :: ipos minlst = ipos2 do ipos = ipos2 - 1, ipos1, -1 if (xxtrt(ipos) < xxtrt(minlst)) then minlst = ipos end if end do end function minlst

which does not receive address as a parameter (this the IR does not have)

The main part of the test is to check we can vectorize this particular idiom, which should be independent of anything frontend-specific.

You could add a separate more complex test in addition, but I don't see how `%early_exit_cond and the sign extends are relevant, IIUC they could just be passed as wider types.

I like the idea of having another complex test. So here is what I did in the latest revision.

Passed widened values and pointers as parameters.

Passed %early_exit_cond as a parameter.
These have simplified the test a lot.

Next, I moved the existing version to a new test called last-min-index-ftn.ll.

Please note, my follow-up patch is adding LoopIdiomVectorize based implementation based on the complex version. In the original fortran code, %early_exit_cond and pointers are NOT coming as parameters, thus I need the test to justify the idiom recognition in my follow-up patch.

Mel-Chen · 2025-05-29T08:12:28Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %iv = phi i64 [ %last_index_sext, %loop.preheader ], [ %iv.next, %loop ]
+  %dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ]


Do you really need two induction variable with same step?

They have the same step -1. Can’t they be merged into one?

Mel-Chen · 2025-05-29T08:27:23Z

Related PRs: #141431 and #141467.

No, I think this is #140451.
There is only one phi node involved the reduction.

madhur13490 · 2025-05-29T09:53:04Z

Hi @Mel-Chen Thanks for the review. I have addressed some of your comments. For others, I want to preserve the structure of the code generated by the Flang compiler. As said, this is extracted from Flang pipeline and particularly the IR is just before LoopIdiomVectorize pass, thus we have LCSSA structure preserved.

Mel-Chen · 2025-06-03T13:38:33Z

Hi @Mel-Chen Thanks for the review. I have addressed some of your comments. For others, I want to preserve the structure of the code generated by the Flang compiler. As said, this is extracted from Flang pipeline and particularly the IR is just before LoopIdiomVectorize pass, thus we have LCSSA structure preserved.

Generally, we hope test cases to be as precise and clean as possible. So unless the input IR not being in LCSSA form is the reason LoopVectorize or LoopIdiomVectorize doesn't work, it's better to clean up the test case.

madhur13490 · 2025-06-03T13:49:06Z

Cross-posting from Discourse: https://discourse.llvm.org/t/vectorizing-min-max-reduction-pattern/85766

madhur13490 · 2025-06-16T08:46:37Z

Gentle ping @Mel-Chen. I have made the changes except one on the passing pointers as args. Please have a look.

Mel-Chen · 2025-06-17T06:26:51Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %first_ptr = getelementptr i8, ptr %array, i64 -8
+  %second_ptr = getelementptr i8, ptr %array, i64 -4


Mel-Chen · 2025-06-17T06:48:29Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  br label %loop
+
+loop:                                           ; preds = %loop.preheader, %loop
+  %iv = phi i64 [%iv.next, %loop], [ %last_index_sext, %loop.preheader ]


nit

Suggested change

%iv = phi i64 [%iv.next, %loop], [ %last_index_sext, %loop.preheader ]

%iv = phi i64 [ %iv.next, %loop ], [ %last_index_sext, %loop.preheader ]

Mel-Chen · 2025-06-17T09:13:19Z

llvm/test/Transforms/LoopVectorize/last-min-index.ll

+  %index_sext = sext i32 %index to i64
+  %load2_ptr = getelementptr float, ptr %second_ptr, i64 %index_sext
+  %load2 = load float, ptr %load2_ptr, align 4


I suspect that this case might not be vectorizable by the vectorizer. The value of %load2 depends on the result of %index from the previous iteration. Would this dependency prevent %load2 from being directly widened into a vector load? Has this issue already been addressed?

Yes, this is not vectorizable by vectorizer and that is one of the reasons of adding this test.

I think there might have been a misunderstanding. What I meant was whether this case could potentially be vectorized in the future. The reason I brought it up is:

The value of %load2 depends on the result of %index from the previous iteration. Would this dependency prevent %load2 from being directly widened into a vector load?

This goes beyond just recurrence analysis.
By the way, is this intended as a pre-commit test case? If so, is there already a corresponding patch for it?

Yes, just created WIP PR #144987. The PR needs some cleanup so should be ready for review very soon.

OK, but PR #144987 is not a LoopVectorize change.

This case demonstrates where LV fails to perform vectorization; thus, I added it in LoopVectorize. I don't mind it moving to LoopIdiomVectorize, though, if we prefer.

This patch adds a test case extracted from Polyhedron benchmark. https://fortran.uk/fortran-compiler-comparisons/the-polyhedron-solutions-benchmark-suite/ The test is specifically interesting for vectorizing min/max reduction pattern.

This patch vectorizes the case where the array scan happens backwards and first minidx is returned. Motivating example is found in rnflow FORTRAN benchmark. Pre-commit test can be found as part of llvm#141556

david-arm · 2025-06-20T10:54:47Z

llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll

+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL1
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL2
+
+; This test case is extracted from rnflow (fortran) benchmark in polyhedron benchmark suite.


Is this testing a common pattern from Fortran, i.e. minloc that looks for the location of the minimum value in an array? I just want to understand if this is something very common to Fortran in general, or for one very specific case in a particular benchmark? If it's the former, then I think it makes sense to create more general, hand-written tests that demonstrate minloc in a variety of situations, i.e. different array types, etc.

david-arm · 2025-06-20T10:58:43Z

llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll

+  %iv = phi i64 [%iv.next, %loop], [ %last_index_sext, %loop.preheader ]
+  %dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ]
+  %index = phi i32 [ %select, %loop ], [ %last_index, %loop.preheader ]
+  %iv.next = add nsw i64 %iv, -1


Does minloc always end up generating a reverse-counting loop, or does Fortran also generate forward-counting loops as well?

Please see Fortran source code. The source has reverse-counting loop thus we it in IR; unfortunately we can't generalize this.

OK, then I think this test is in the wrong directory. If there is no intention of anyone doing work to loop-vectorise this in the near future then it shouldn't live here.

david-arm · 2025-06-20T11:16:39Z

llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll

+  %load1 = load float, ptr %load1_ptr, align 4
+  %index_sext = sext i32 %index to i64
+  %load2_ptr = getelementptr float, ptr %second_ptr, i64 %index_sext
+  %load2 = load float, ptr %load2_ptr, align 4


I agree with @Mel-Chen, this looks very difficult to loop-vectorise because the pointer depends upon a value loaded from the previous iteration. Surely, the root problem here is that LoadPRE in an earlier IR pass has failed to spot this recurrence and keep a copy of the last loaded value corresponding to array[%index] in a phi node? For example, if the IR looked more like this:

loop: ; preds = %loop.preheader, %loop %iv = phi i64 [%iv.next, %loop], [ %last_index_sext, %loop.preheader ] %dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ] %index = phi i32 [ %select, %loop ], [ %last_index, %loop.preheader ] %val_at_index = phi float [ %new_val_at_index, %loop ], [ ... ] %iv.next = add nsw i64 %iv, -1 %load1_ptr = getelementptr float, ptr %first_ptr, i64 %iv %load1 = load float, ptr %load1_ptr, align 4 %cmp = fcmp contract olt float %load1, %val_at_index %iv.next.trunc = trunc nsw i64 %iv.next to i32 %select = select i1 %cmp, i32 %iv.next.trunc, i32 %index %val_at_index = select i1 %cmp, %load1, %val_at_index %dec = add nsw i64 %dec_iv, -1 %loop_cond = icmp sgt i64 %dec_iv, 1 br i1 %loop_cond, label %loop, label %._crit_edge

not only would the scalar version be faster, but it might give the loop vectoriser a better chance of using existing min/max reduction infrastructure to vectorise it?

Perhaps I'm missing some critical limitation here, but it feels like the focus we should be trying to optimise the scalar IR before we attempt vectorisation. There are no stores in the loop and the bounds seem to be well-known.

Yes, you are right. My first attempt was to fix this in GVN. Indeed, GCC handles this in GVN pass. A bit more context, GCC employs very sophisticated GVN algorithm based on Availability and anticipability. LLVM's current implementation is not mature and is based on an older algorithm from literature. I had started Discourse post here for bigger context. https://discourse.llvm.org/t/newgvn-gvn-pre-plans/84746

Considering this, LoopIdiomVectorize pass seems most suitable to me. You can also find our previous discussion on Discourse here. https://discourse.llvm.org/t/vectorizing-min-max-reduction-pattern/85766

If the recurrence pattern can be detected/optimized in LoopIdiomRecognize, what prevents us to detect/optimize the scalar code in isolation?

Currently the limitation in GVN (if we go by what GCC does). GVN is GCC is much more sophisticated than what LLVM has and improving LLVM's GVN is a large effort. However, having said that, optimizing scalar code would lead to 10-12% gain - as we would eliminate just the PRE case, vectorization leads to 4x speedup on an average.

madhur13490 requested review from fhahn and Mel-Chen May 27, 2025 07:10

llvmbot added the llvm:transforms label May 27, 2025

fhahn reviewed May 27, 2025

View reviewed changes

Mel-Chen reviewed May 29, 2025

View reviewed changes

madhur13490 force-pushed the add_minloc_test2 branch from 936d6ca to 3cef654 Compare May 29, 2025 09:50

madhur13490 force-pushed the add_minloc_test2 branch from 245cd46 to b691b00 Compare June 11, 2025 06:14

Mel-Chen reviewed Jun 17, 2025

View reviewed changes

madhur13490 added 5 commits June 17, 2025 02:59

[LoopVectorize] Add test case for minloc reduction

b4ee218

This patch adds a test case extracted from Polyhedron benchmark. https://fortran.uk/fortran-compiler-comparisons/the-polyhedron-solutions-benchmark-suite/ The test is specifically interesting for vectorizing min/max reduction pattern.

Address review comments

b799f55

Address review comments

01dee72

Address review comments

77fe860

Address review comments and add complex version

fbf4ac5

madhur13490 force-pushed the add_minloc_test2 branch from b691b00 to fbf4ac5 Compare June 17, 2025 10:26

madhur13490 mentioned this pull request Jun 20, 2025

[WIP][LoopIdiomVectorize] Recognize and transform minidx pattern #144987

Draft

david-arm reviewed Jun 20, 2025

View reviewed changes

		@@ -0,0 +1,84 @@
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
		; RUN: opt -passes=loop-vectorize -S %s \| FileCheck %s --check-prefix=CHECK-REV-MIN

	; RUN: opt -passes=loop-vectorize -S %s \| FileCheck %s --check-prefix=CHECK-REV-MIN
	; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s \| FileCheck %s

	%indvars.iv = phi i64 [ %7, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
	%iv = phi i64 [ %7, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]

		@@ -0,0 +1,86 @@
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
		; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s \| FileCheck %s --check-prefix=CHECK-REV-MIN

		%first_ptr = getelementptr i8, ptr %array, i64 -8
		%second_ptr = getelementptr i8, ptr %array, i64 -4

		%iv = phi i64 [ %last_index_sext, %loop.preheader ], [ %iv.next, %loop ]
		%dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ]

	%iv = phi i64 [%iv.next, %loop], [ %last_index_sext, %loop.preheader ]
	%iv = phi i64 [ %iv.next, %loop ], [ %last_index_sext, %loop.preheader ]

[LoopVectorize] Add test case for minloc reduction #141556

Are you sure you want to change the base?

[LoopVectorize] Add test case for minloc reduction #141556

Conversation

madhur13490 commented May 27, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 27, 2025

Uh oh!

madhur13490 commented May 27, 2025

Uh oh!

fhahn left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

madhur13490 commented May 27, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

madhur13490 commented May 28, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

madhur13490 Jun 11, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

madhur13490 Jun 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Mel-Chen Jun 3, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Mel-Chen commented May 29, 2025

Uh oh!

madhur13490 commented May 29, 2025

Uh oh!

Mel-Chen commented Jun 3, 2025

Uh oh!

madhur13490 commented Jun 3, 2025

Uh oh!

madhur13490 commented Jun 16, 2025

madhur13490 commented May 27, 2025 •

edited

Loading

madhur13490 commented May 27, 2025 •

edited

Loading

madhur13490 Jun 11, 2025 •

edited

Loading

madhur13490 Jun 17, 2025 •

edited

Loading

Mel-Chen Jun 3, 2025 •

edited

Loading

david-arm Jun 23, 2025 •

edited

Loading