Skip to content

Commit 5f8f703

Browse files
committed
[LAA] Update comment, update prev test and add new test
1 parent 98e2348 commit 5f8f703

File tree

2 files changed

+107
-27
lines changed

2 files changed

+107
-27
lines changed

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2143,8 +2143,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
21432143

21442144
// It's not vectorizable if the distance is smaller than the minimum distance
21452145
// needed for a vectorized/unrolled version. Vectorizing one iteration in
2146-
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2147-
// TypeByteSize (No need to plus the last gap distance).
2146+
// front needs TypeByteSize * Stride (MaxStride in case of different strides).
2147+
// Vectorizing the last iteration needs TypeByteSize (no need to add the last
2148+
// gap distance).
21482149
//
21492150
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
21502151
// foo(int *A) {
@@ -2167,6 +2168,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
21672168
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
21682169
// the minimum distance needed is 28, which is greater than distance. It is
21692170
// not safe to do vectorization.
2171+
//
2172+
// We use MaxStride (the maximum of the src and sink strides) to get a conservative
2173+
// lower bound on MinDistanceNeeded in case of different strides.
21702174

21712175
// We know that Dist is positive, but it may not be constant. Use the signed
21722176
// minimum for computations below, as this ensures we compute the closest

llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll

Lines changed: 101 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33

44
@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
55

6-
; Equivalent C code for the test case:
6+
; Generated from the following C code:
77
; #define LEN 256 * 256
88
; float a[LEN];
9-
9+
;
1010
; void different_strides() {
1111
; for (int i = 0; i < LEN - 1024 - 255; i++) {
1212
; #pragma clang loop interleave(disable)
@@ -15,9 +15,11 @@
1515
; a[i + j + 1024] += a[j * 4 + i];
1616
; }
1717
; }
18-
define dso_local void @different_strides() local_unnamed_addr {
19-
; CHECK-LABEL: 'different_strides'
20-
; CHECK-NEXT: for.body4:
18+
; The load and store have different strides (16 and 4 bytes respectively), but the store
19+
; is always at a safe positive distance away from the load, thus BackwardVectorizable.
20+
define dso_local void @different_strides_backward_vectorizable() local_unnamed_addr {
21+
; CHECK-LABEL: 'different_strides_backward_vectorizable'
22+
; CHECK-NEXT: inner.body:
2123
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
2224
; CHECK-NEXT: Dependences:
2325
; CHECK-NEXT: BackwardVectorizable:
@@ -35,7 +37,82 @@ define dso_local void @different_strides() local_unnamed_addr {
3537
; CHECK-NEXT: SCEV assumptions:
3638
; CHECK-EMPTY:
3739
; CHECK-NEXT: Expressions re-written:
38-
; CHECK-NEXT: for.cond1.preheader:
40+
; CHECK-NEXT: outer.header:
41+
; CHECK-NEXT: Report: loop is not the innermost loop
42+
; CHECK-NEXT: Dependences:
43+
; CHECK-NEXT: Run-time memory checks:
44+
; CHECK-NEXT: Grouped accesses:
45+
; CHECK-EMPTY:
46+
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
47+
; CHECK-NEXT: SCEV assumptions:
48+
; CHECK-EMPTY:
49+
; CHECK-NEXT: Expressions re-written:
50+
;
51+
entry:
52+
br label %outer.header
53+
54+
outer.header:
55+
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
56+
%0 = add nuw nsw i64 %i, 1024
57+
br label %inner.body
58+
59+
inner.body:
60+
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
61+
%1 = shl nuw nsw i64 %j, 2
62+
%2 = add nuw nsw i64 %1, %i
63+
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
64+
%3 = load float, ptr %arrayidx, align 4
65+
%4 = add nuw nsw i64 %0, %j
66+
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
67+
%5 = load float, ptr %arrayidx8, align 4
68+
%add9 = fadd fast float %5, %3
69+
store float %add9, ptr %arrayidx8, align 4
70+
%j.next = add nuw nsw i64 %j, 1
71+
%exitcond.not = icmp eq i64 %j.next, 256
72+
br i1 %exitcond.not, label %outer.exit, label %inner.body
73+
74+
outer.exit:
75+
%i.next = add nuw nsw i64 %i, 1
76+
%outerexitcond.not = icmp eq i64 %i.next, 64257
77+
br i1 %outerexitcond.not, label %exit, label %outer.header
78+
79+
exit:
80+
ret void
81+
}
82+
83+
84+
; Generated from the following C code:
85+
; void different_stride_and_not_vectorizable(){
86+
; for(int i = 0; i < LEN2; i++){
87+
; for(int j = 0 ; j < LEN; j++){
88+
; a[i + j + LEN] += a[i + 4*j];
89+
; }
90+
; }
91+
; }
92+
; The load and store have different strides, but the store and load are not at a
93+
; safe distance away from each other, thus not safe for vectorization.
94+
define dso_local void @different_stride_and_not_vectorizable() local_unnamed_addr {
95+
; CHECK-LABEL: 'different_stride_and_not_vectorizable'
96+
; CHECK-NEXT: inner.body:
97+
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
98+
; CHECK-NEXT: Unknown data dependence.
99+
; CHECK-NEXT: Dependences:
100+
; CHECK-NEXT: Unknown:
101+
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
102+
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
103+
; CHECK-EMPTY:
104+
; CHECK-NEXT: Forward:
105+
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
106+
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
107+
; CHECK-EMPTY:
108+
; CHECK-NEXT: Run-time memory checks:
109+
; CHECK-NEXT: Grouped accesses:
110+
; CHECK-EMPTY:
111+
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
112+
; CHECK-NEXT: SCEV assumptions:
113+
; CHECK-EMPTY:
114+
; CHECK-NEXT: Expressions re-written:
115+
; CHECK-NEXT: outer.header:
39116
; CHECK-NEXT: Report: loop is not the innermost loop
40117
; CHECK-NEXT: Dependences:
41118
; CHECK-NEXT: Run-time memory checks:
@@ -47,34 +124,33 @@ define dso_local void @different_strides() local_unnamed_addr {
47124
; CHECK-NEXT: Expressions re-written:
48125
;
49126
entry:
50-
br label %for.cond1.preheader
127+
br label %outer.header
51128

52-
for.cond1.preheader:
53-
%indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
54-
%0 = add nuw nsw i64 %indvars.iv25, 1024
55-
br label %for.body4
129+
outer.header:
130+
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
131+
%0 = add nuw nsw i64 %i, 256
132+
br label %inner.body
56133

57-
for.cond.cleanup:
134+
exit:
58135
ret void
59136

60-
for.cond.cleanup3:
61-
%indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
62-
%exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
63-
br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
137+
outer.exit:
138+
%i.next = add nuw nsw i64 %i, 1
139+
%exitcond29.not = icmp eq i64 %i.next, 65536
140+
br i1 %exitcond29.not, label %exit, label %outer.header
64141

65-
for.body4:
66-
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
67-
%1 = shl nuw nsw i64 %indvars.iv, 2
68-
%2 = add nuw nsw i64 %1, %indvars.iv25
142+
inner.body:
143+
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
144+
%1 = shl nuw nsw i64 %j, 2
145+
%2 = add nuw nsw i64 %1, %i
69146
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
70147
%3 = load float, ptr %arrayidx, align 4
71-
%4 = add nuw nsw i64 %0, %indvars.iv
148+
%4 = add nuw nsw i64 %0, %j
72149
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
73150
%5 = load float, ptr %arrayidx8, align 4
74151
%add9 = fadd fast float %5, %3
75152
store float %add9, ptr %arrayidx8, align 4
76-
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
77-
%exitcond.not = icmp eq i64 %indvars.iv.next, 256
78-
br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
153+
%j.next = add nuw nsw i64 %j, 1
154+
%exitcond.not = icmp eq i64 %j.next, 256
155+
br i1 %exitcond.not, label %outer.exit, label %inner.body
79156
}
80-

0 commit comments

Comments
 (0)