Skip to content

Commit cecc0d2

Browse files
committed
[NewPM] Add an SROA pass after loop unroll
If a small local array is accessed in a loop, SROA cannot handle the memory accesses because their offsets vary from iteration to iteration. After the loop is fully unrolled, all memory accesses to the array have fixed offsets, so they can be processed by SROA. However, no SROA pass runs after loop unrolling. This patch adds an SROA pass after loop unrolling to handle this pattern. Differential Revision: https://reviews.llvm.org/D68593
1 parent 89b7f16 commit cecc0d2

File tree

4 files changed

+66
-0
lines changed

4 files changed

+66
-0
lines changed

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
490490
FPM.addPass(createFunctionToLoopPassAdaptor(
491491
std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging));
492492

493+
// Delete small array after loop unroll.
494+
FPM.addPass(SROA());
495+
493496
// Eliminate redundancies.
494497
if (Level != O1) {
495498
// These passes add substantial compile time so skip them at O1.

llvm/test/Other/new-pm-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@
179179
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
180180
; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
181181
; CHECK-O-NEXT: Finished Loop pass manager run.
182+
; CHECK-O-NEXT: Running pass: SROA on foo
182183
; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
183184
; CHECK-Os-NEXT: Running pass: GVN
184185
; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/new-pm-thinlto-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@
156156
; CHECK-O-NEXT: Running pass: LoopDeletionPass
157157
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
158158
; CHECK-O-NEXT: Finished Loop pass manager run.
159+
; CHECK-O-NEXT: Running pass: SROA on foo
159160
; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
160161
; CHECK-Os-NEXT: Running pass: GVN
161162
; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/unroll-sroa.ll

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
; RUN: opt -disable-verify -passes='default<O2>' -S < %s | FileCheck %s
2+
3+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
4+
target triple = "x86_64-unknown-linux-gnu"
5+
6+
; The local array %tmp can only be optimized away by SROA after the loop has been fully unrolled.
7+
8+
; CHECK-LABEL: define void @foo
9+
; CHECK-NOT: alloca
10+
; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64
11+
12+
; Function Attrs: nounwind uwtable
13+
; Test input: a loop that runs for i = 0..3 converts a[i] from i32 to float and
; stores it into the local array %tmp at a loop-variant index. After the loop,
; the 16 bytes of %tmp are copied with memcpy to the buffer pointed to by %b.
define void @foo(i32* %a, i32* %b) {
14+
entry:
15+
%a.addr = alloca i32*, align 8
16+
%b.addr = alloca i32*, align 8
17+
; Local [4 x float] array; the CHECK-NOT lines of this test expect no alloca
; to remain in the optimized output.
%tmp = alloca [4 x float], align 16
18+
%i = alloca i32, align 4
19+
store i32* %a, i32** %a.addr, align 8
20+
store i32* %b, i32** %b.addr, align 8
21+
; The loop counter %i starts at 0.
store i32 0, i32* %i, align 4
22+
br label %for.cond
23+
24+
for.cond: ; preds = %for.inc, %entry
25+
%iter2 = load i32, i32* %i, align 4
26+
; Continue while i < 4 (signed compare); otherwise leave the loop.
%cmp = icmp slt i32 %iter2, 4
27+
br i1 %cmp, label %for.body, label %for.cond.cleanup
28+
29+
for.cond.cleanup: ; preds = %for.cond
30+
br label %for.end
31+
32+
for.body: ; preds = %for.cond
33+
; Load a[i] and convert it from signed i32 to float.
%inptr = load i32*, i32** %a.addr, align 8
34+
%idx2 = load i32, i32* %i, align 4
35+
%idxprom = sext i32 %idx2 to i64
36+
%arrayidx = getelementptr inbounds i32, i32* %inptr, i64 %idxprom
37+
%val = load i32, i32* %arrayidx, align 4
38+
%conv = sitofp i32 %val to float
39+
%idx = load i32, i32* %i, align 4
40+
%idxprom1 = sext i32 %idx to i64
41+
; The element index into %tmp is the loop counter, so this store has a
; loop-variant offset until the loop is unrolled.
%arrayidx2 = getelementptr inbounds [4 x float], [4 x float]* %tmp, i64 0, i64 %idxprom1
42+
store float %conv, float* %arrayidx2, align 4
43+
br label %for.inc
44+
45+
for.inc: ; preds = %for.body
46+
; i = i + 1, then branch back to the loop condition.
%iter = load i32, i32* %i, align 4
47+
%inc = add nsw i32 %iter, 1
48+
store i32 %inc, i32* %i, align 4
49+
br label %for.cond
50+
51+
for.end: ; preds = %for.cond.cleanup
52+
%dstptr = load i32*, i32** %b.addr, align 8
53+
%dst = bitcast i32* %dstptr to i8*
54+
%arraydecay = getelementptr inbounds [4 x float], [4 x float]* %tmp, i64 0, i64 0
55+
%src = bitcast float* %arraydecay to i8*
56+
; Copy 16 bytes starting at element 0 of %tmp into the buffer loaded from
; %b.addr.
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %dst, i8* align 16 %src, i64 16, i1 false)
57+
ret void
58+
}
59+
60+
; Function Attrs: argmemonly nounwind willreturn
61+
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg)

0 commit comments

Comments
 (0)