Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 78dd298

Browse files
committed
[SCEV] Add zext(C + x + ...) -> D + zext(C-D + x + ...)<nuw><nsw> transform
if the top level addition in (D + (C-D + x + ...)) could be proven to not wrap, where the choice of D also maximizes the number of trailing zeroes of (C-D + x + ...), ensuring homogeneous behaviour of the transformation and better canonicalization of such expressions. This enables better canonicalization of expressions like 1 + zext(5 + 20 * %x + 24 * %y) and zext(6 + 20 * %x + 24 * %y), which both get transformed to 2 + zext(4 + 20 * %x + 24 * %y). This pattern is common in address arithmetic, and the transformation makes it easier for passes like LoadStoreVectorizer to prove that 2 or more memory accesses are consecutive and to optimize (vectorize) them. Reviewed By: mzolotukhin Differential Revision: https://reviews.llvm.org/D48853 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337859 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent a783e1f commit 78dd298

File tree

3 files changed

+197
-0
lines changed

3 files changed

+197
-0
lines changed

lib/Analysis/ScalarEvolution.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,6 +1777,44 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
17771777
Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
17781778
return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1);
17791779
}
1780+
1781+
// zext(C + x + y + ...) --> (zext(D) + zext((C - D) + x + y + ...))<nuw>
1782+
// if D + (C - D + x + y + ...) could be proven to not unsigned wrap
1783+
// where D maximizes the number of trailing zeros of (C - D + x + y + ...)
1784+
//
1785+
// Useful while proving that address arithmetic expressions are equal or
1786+
// differ by a small constant amount, see LoadStoreVectorizer pass.
1787+
if (const auto *SC = dyn_cast<SCEVConstant>(SA->getOperand(0))) {
1788+
// Often address arithmetics contain expressions like
1789+
// (zext (add (shl X, C1), C2)), for instance, (zext (5 + (4 * X))).
1790+
// ConstantRange is unable to prove that it's possible to transform
1791+
// (5 + (4 * X)) to (1 + (4 + (4 * X))) w/o underflowing:
1792+
//
1793+
// | Expression | ConstantRange | KnownBits |
1794+
// |---------------|------------------------|-----------------------|
1795+
// | i8 4 * X | [L: 0, U: 253) | XXXX XX00 |
1796+
// | | => Min: 0, Max: 252 | => Min: 0, Max: 252 |
1797+
// | | | |
1798+
// | i8 4 * X + 5 | [L: 5, U: 2) (wrapped) | YYYY YY01 |
1799+
// | (101) | => Min: 0, Max: 255 | => Min: 1, Max: 253 |
1800+
//
1801+
// As KnownBits are not available for SCEV expressions, use number of
1802+
// trailing zeroes instead:
1803+
APInt C = SC->getAPInt();
1804+
uint32_t TZ = C.getBitWidth();
1805+
for (unsigned I = 1, E = SA->getNumOperands(); I < E && TZ; ++I)
1806+
TZ = std::min(TZ, GetMinTrailingZeros(SA->getOperand(I)));
1807+
if (TZ) {
1808+
APInt D = TZ < C.getBitWidth() ? C.trunc(TZ).zext(C.getBitWidth()) : C;
1809+
if (D != 0) {
1810+
const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth);
1811+
const SCEV *SResidual =
1812+
getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth);
1813+
const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1);
1814+
return getAddExpr(SZExtD, SZExtR, SCEV::FlagNUW, Depth + 1);
1815+
}
1816+
}
1817+
}
17801818
}
17811819

17821820
if (auto *SM = dyn_cast<SCEVMulExpr>(Op)) {

test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,84 @@ define void @f2(i8* %len_addr) {
120120

121121
ret void
122122
}
123+
124+
@z_addr = external global [16 x i8], align 4
125+
@z_addr_noalign = external global [16 x i8]
126+
127+
%union = type { [10 x [4 x float]] }
128+
@tmp_addr = external unnamed_addr global { %union, [2000 x i8] }
129+
130+
define void @f3(i8* %x_addr, i8* %y_addr, i32* %tmp_addr) {
131+
; CHECK-LABEL: Classifying expressions for: @f3
132+
entry:
133+
%x = load i8, i8* %x_addr
134+
%t0 = mul i8 %x, 4
135+
%t1 = add i8 %t0, 5
136+
%t1.zext = zext i8 %t1 to i16
137+
; CHECK: %t1.zext = zext i8 %t1 to i16
138+
; CHECK-NEXT: --> (1 + (zext i8 (4 + (4 * %x)) to i16))<nuw><nsw> U: [1,254) S: [1,257)
139+
140+
%q0 = mul i8 %x, 4
141+
%q1 = add i8 %q0, 7
142+
%q1.zext = zext i8 %q1 to i16
143+
; CHECK: %q1.zext = zext i8 %q1 to i16
144+
; CHECK-NEXT: --> (3 + (zext i8 (4 + (4 * %x)) to i16))<nuw><nsw> U: [3,256) S: [3,259)
145+
146+
%p0 = mul i8 %x, 4
147+
%p1 = add i8 %p0, 8
148+
%p1.zext = zext i8 %p1 to i16
149+
; CHECK: %p1.zext = zext i8 %p1 to i16
150+
; CHECK-NEXT: --> (zext i8 (8 + (4 * %x)) to i16) U: [0,253) S: [0,256)
151+
152+
%r0 = mul i8 %x, 4
153+
%r1 = add i8 %r0, 254
154+
%r1.zext = zext i8 %r1 to i16
155+
; CHECK: %r1.zext = zext i8 %r1 to i16
156+
; CHECK-NEXT: --> (2 + (zext i8 (-4 + (4 * %x)) to i16))<nuw><nsw> U: [2,255) S: [2,258)
157+
158+
%y = load i8, i8* %y_addr
159+
%s0 = mul i8 %x, 32
160+
%s1 = mul i8 %y, 36
161+
%s2 = add i8 %s0, %s1
162+
%s3 = add i8 %s2, 5
163+
%s3.zext = zext i8 %s3 to i16
164+
; CHECK: %s3.zext = zext i8 %s3 to i16
165+
; CHECK-NEXT: --> (1 + (zext i8 (4 + (32 * %x) + (36 * %y)) to i16))<nuw><nsw> U: [1,254) S: [1,257)
166+
167+
%ptr = bitcast [16 x i8]* @z_addr to i8*
168+
%int0 = ptrtoint i8* %ptr to i32
169+
%int5 = add i32 %int0, 5
170+
%int.zext = zext i32 %int5 to i64
171+
; CHECK: %int.zext = zext i32 %int5 to i64
172+
; CHECK-NEXT: --> (1 + (zext i32 (4 + %int0) to i64))<nuw><nsw> U: [1,4294967294) S: [1,4294967297)
173+
174+
%ptr_noalign = bitcast [16 x i8]* @z_addr_noalign to i8*
175+
%int0_na = ptrtoint i8* %ptr_noalign to i32
176+
%int5_na = add i32 %int0_na, 5
177+
%int.zext_na = zext i32 %int5_na to i64
178+
; CHECK: %int.zext_na = zext i32 %int5_na to i64
179+
; CHECK-NEXT: --> (zext i32 (5 + %int0_na) to i64) U: [0,4294967296) S: [0,4294967296)
180+
181+
%tmp = load i32, i32* %tmp_addr
182+
%mul = and i32 %tmp, -4
183+
%add4 = add i32 %mul, 4
184+
%add4.zext = zext i32 %add4 to i64
185+
%sunkaddr3 = mul i64 %add4.zext, 4
186+
%sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @tmp_addr to i8*), i64 %sunkaddr3
187+
%sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
188+
%addr4.cast = bitcast i8* %sunkaddr5 to i32*
189+
%addr4.incr = getelementptr i32, i32* %addr4.cast, i64 1
190+
; CHECK: %addr4.incr = getelementptr i32, i32* %addr4.cast, i64 1
191+
; CHECK-NEXT: --> ([[C:4100]] + ([[SIZE:4]] * (zext i32 ([[OFFSET:4]] + ([[STRIDE:4]] * (%tmp /u [[STRIDE]]))<nuw>) to i64))<nuw><nsw> + @tmp_addr)
192+
193+
%add5 = add i32 %mul, 5
194+
%add5.zext = zext i32 %add5 to i64
195+
%sunkaddr0 = mul i64 %add5.zext, 4
196+
%sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @tmp_addr to i8*), i64 %sunkaddr0
197+
%sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
198+
%addr5.cast = bitcast i8* %sunkaddr2 to i32*
199+
; CHECK: %addr5.cast = bitcast i8* %sunkaddr2 to i32*
200+
; CHECK-NEXT: --> ([[C]] + ([[SIZE]] * (zext i32 ([[OFFSET]] + ([[STRIDE]] * (%tmp /u [[STRIDE]]))<nuw>) to i64))<nuw><nsw> + @tmp_addr)
201+
202+
ret void
203+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
2+
; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s
3+
4+
target triple = "x86_64--"
5+
6+
%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
7+
8+
@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
9+
10+
; Function Attrs: convergent nounwind
11+
define void @test(i32 %base) #0 {
12+
; CHECK-LABEL: @test(
13+
; CHECK-NOT: load i32
14+
; CHECK: load <2 x i32>
15+
; CHECK-NOT: load i32
16+
entry:
17+
%mul331 = and i32 %base, -4
18+
%add350.4 = add i32 4, %mul331
19+
%idx351.4 = zext i32 %add350.4 to i64
20+
%arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4
21+
%tmp296.4 = bitcast float* %arrayidx352.4 to i32*
22+
%add350.5 = add i32 5, %mul331
23+
%idx351.5 = zext i32 %add350.5 to i64
24+
%arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5
25+
%tmp296.5 = bitcast float* %arrayidx352.5 to i32*
26+
%cnd = icmp ult i32 %base, 1000
27+
br i1 %cnd, label %loads, label %exit
28+
29+
loads:
30+
; If and only if the loads are in a different BB from the GEPs codegenprepare
31+
; would try to turn the GEPs into math, which makes LoadStoreVectorizer's job
32+
; harder
33+
%tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
34+
%tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
35+
br label %exit
36+
37+
exit:
38+
ret void
39+
}
40+
41+
; Function Attrs: convergent nounwind
42+
define void @test.codegenprepared(i32 %base) #0 {
43+
; CHECK-LABEL: @test.codegenprepared(
44+
; CHECK-NOT: load i32
45+
; CHECK: load <2 x i32>
46+
; CHECK-NOT: load i32
47+
entry:
48+
%mul331 = and i32 %base, -4
49+
%add350.4 = add i32 4, %mul331
50+
%idx351.4 = zext i32 %add350.4 to i64
51+
%add350.5 = add i32 5, %mul331
52+
%idx351.5 = zext i32 %add350.5 to i64
53+
%cnd = icmp ult i32 %base, 1000
54+
br i1 %cnd, label %loads, label %exit
55+
56+
loads: ; preds = %entry
57+
%sunkaddr = mul i64 %idx351.4, 4
58+
%sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
59+
%sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
60+
%0 = bitcast i8* %sunkaddr2 to i32*
61+
%tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
62+
%sunkaddr3 = mul i64 %idx351.5, 4
63+
%sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
64+
%sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
65+
%1 = bitcast i8* %sunkaddr5 to i32*
66+
%tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
67+
br label %exit
68+
69+
exit: ; preds = %loads, %entry
70+
ret void
71+
}
72+
73+
attributes #0 = { convergent nounwind }
74+
75+
!0 = !{!1, !1, i64 0}
76+
!1 = !{!"float", !2, i64 0}
77+
!2 = !{!"omnipotent char", !3, i64 0}
78+
!3 = !{!"Simple C++ TBAA"}

0 commit comments

Comments
 (0)