-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[ARM] Test unroll behaviour on machines with low overhead branching #118692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: None (VladiKrapp-Arm) ChangesAdd test for existing loop unroll behaviour. Full diff: https://github.com/llvm/llvm-project/pull/118692.diff 1 Files Affected:
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll
new file mode 100644
index 00000000000000..b155f5d31045f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll
@@ -0,0 +1,63 @@
+; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB
+
+; This test pins down the existing loop-unroll behaviour on processors with low overhead branching available: the inner fmul loop is runtime-unrolled by 4, with a 3-iteration unrolled prologue remainder.
+
+; LOB-LABEL: for.body{{.*}}.prol
+; LOB-COUNT-1: fmul fast float
+; LOB-LABEL: for.body{{.*}}.prol.1
+; LOB-COUNT-1: fmul fast float
+; LOB-LABEL: for.body{{.*}}.prol.2
+; LOB-COUNT-1: fmul fast float
+; LOB-LABEL: for.body{{.*}}
+; LOB-COUNT-4: fmul fast float
+; LOB-NOT: fmul fast float
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 {
+entry:
+  %cmp46 = icmp sgt i32 %n, 0
+  br i1 %cmp46, label %for.body, label %for.cond.cleanup
+
+for.cond.loopexit:                                ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body
+  %exitcond49.not = icmp eq i32 %add, %n
+  br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.cond.loopexit
+  %k.047 = phi i32 [ %add, %for.cond.loopexit ], [ 0, %entry ]
+  %add = add nuw nsw i32 %k.047, 1
+  %cmp244 = icmp slt i32 %add, %n
+  br i1 %cmp244, label %for.cond6.preheader.lr.ph, label %for.cond.loopexit
+
+for.cond6.preheader.lr.ph:                        ; preds = %for.body
+  %invariant.gep = getelementptr float, ptr %pA, i32 %k.047
+  br label %for.cond6.preheader.us
+
+for.cond6.preheader.us:                           ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.cond6.preheader.lr.ph
+  %w.045.us = phi i32 [ %add, %for.cond6.preheader.lr.ph ], [ %inc19.us, %for.cond6.for.cond.cleanup8_crit_edge.us ]
+  %mul.us = mul nuw nsw i32 %w.045.us, %n
+  %0 = getelementptr float, ptr %pA, i32 %mul.us
+  %arrayidx.us = getelementptr float, ptr %0, i32 %k.047
+  br label %for.body9.us
+
+for.body9.us:                                     ; preds = %for.cond6.preheader.us, %for.body9.us
+  %x.043.us = phi i32 [ %add, %for.cond6.preheader.us ], [ %inc.us, %for.body9.us ]
+  %1 = load float, ptr %arrayidx.us, align 4
+  %mul11.us = mul nuw nsw i32 %x.043.us, %n
+  %gep.us = getelementptr float, ptr %invariant.gep, i32 %mul11.us
+  %2 = load float, ptr %gep.us, align 4
+  %mul14.us = fmul fast float %2, %1
+  %arrayidx17.us = getelementptr float, ptr %0, i32 %x.043.us
+  store float %mul14.us, ptr %arrayidx17.us, align 4
+  %inc.us = add nuw nsw i32 %x.043.us, 1
+  %exitcond.not = icmp eq i32 %inc.us, %n
+  br i1 %exitcond.not, label %for.cond6.for.cond.cleanup8_crit_edge.us, label %for.body9.us
+
+for.cond6.for.cond.cleanup8_crit_edge.us:         ; preds = %for.body9.us
+  %inc19.us = add nuw nsw i32 %w.045.us, 1
+  %exitcond48.not = icmp eq i32 %inc19.us, %n
+  br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us
+}
+
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. You might want someone else to also look at it, though, before merging.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd be a bit more descriptive on why you're putting this test in: you're planning to put up a patch for X that will change this behaviour in what way.
Current behaviour is the single loop with fmul gets runtime unrolled by count of 4, with the loop remainder unrolled as the 3 for.body9.us.prol sections. This is quite a lot of compare and branch, negating the benefits of the low overhead loop mechanism.
44d5944
to
e518dde
Compare
@stuij : I've updated the commit message (and the first comment on the PR) |
Add test for existing loop unroll behaviour.
Current behaviour is the single loop with fmul gets runtime unrolled by
count of 4, with the loop remainder unrolled as the 3 for.body9.us.prol
sections. This is quite a lot of compare and branch, negating the
benefits of the low overhead loop mechanism.