[LoopCacheAnalysis] Fix loop cache cost to always round the cost up to the nearest integer number.

RouzbehPaktinat · RouzbehPaktinat · commit 73b25935054e · 2024-04-24T10:42:18.000-04:00
Currently loop cache analysis uses following formula to evaluate cost of an RefGroup for a consecutive memory access: "RefCost=(TripCount*Stride)/CLS". When the cost is fractional, it's always rounded to the smaller integer number but this is problematic specially when when the cost is calculated to be a number less than one which will be rounded to zero. But it makes more sense to say one cache line is used in this case rather than zero cache lines.

This patch fixes the problem by always rounding the cost up to the nearest larger integer number.
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -299,7 +299,12 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
     Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
     TripCount = SE.getNoopOrZeroExtend(TripCount, WiderType);
     const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
-    RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+    // Round the fractional cost up to the nearest integer number.
+    // The impact is the most significant when cost is calculated
+    // to be a number less than one, because it makes more sense
+    // to say one cache line is used rather than zero cache line
+    // is used.
+    RefCost = SE.getUDivCeilSCEV(Numerator, CacheLineSize);
 
     LLVM_DEBUG(dbgs().indent(4)
                << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -7,7 +7,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll
 
 ; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK-NEXT: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 130944
 
 ;; #define N 1024
 ;; #define M 2048
@@ -49,7 +49,7 @@ for.end13:                                        ; preds = %for.inc11
 
 
 ; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK-NEXT: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 130944
 
 define void @t2(ptr %a) {
 entry:
@@ -87,7 +87,7 @@ declare ptr @func_with_returned_arg(ptr returned %arg)
 ; CHECK-NEXT: Loop 'for.body4' has cost = 16762927104000000
 ; CHECK-NEXT: Loop 'for.body8' has cost = 130960368000000
 ; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
-; CHECK-NEXT: Loop 'for.body16' has cost = 32260032000
+; CHECK-NEXT: Loop 'for.body16' has cost = 32772096000
 
 ;; #define N 128
 ;; #define M 2048
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll
@@ -38,7 +38,7 @@ for.end:                                          ; preds = %for.cond
 
 ; CHECK: Loop 'for.cond' has cost = 100000000
 ; CHECK: Loop 'for.cond1' has cost = 1000000
-; CHECK: Loop 'for.cond5' has cost = 30000
+; CHECK: Loop 'for.cond5' has cost = 40000
 
 @data = external dso_local global [2 x [4 x [18 x i32]]], align 1
 
@@ -118,7 +118,7 @@ for.neg.end:                                          ; preds = %for.neg.cond
 ; access functions. When this is fixed this testcase should have a cost
 ; approximately 2x higher.
 
-; CHECK: Loop 'for.cond2' has cost = 2560
+; CHECK: Loop 'for.cond2' has cost = 2561
 define void @Test2(ptr %B) {
 entry:
   br label %for.cond2
@@ -148,7 +148,7 @@ for.end:                                          ; preds = %for.cond
 ;   for (i = 40960; i > 0; i--)
 ;     C[i] = C[i];
 
-; CHECK: Loop 'for.cond3' has cost = 2560
+; CHECK: Loop 'for.cond3' has cost = 2561
 define void @Test3(ptr %C) {
 entry:
   br label %for.cond3
@@ -177,7 +177,7 @@ for.end:                                          ; preds = %for.cond
 ;  for (i = 0; i < 40960; i++)
 ;     D[i] = D[i];
 
-; CHECK: Loop 'for.cond4' has cost = 2560
+; CHECK: Loop 'for.cond4' has cost = 2561
 define void @Test4(ptr %D) {
 entry:
   br label %for.cond4
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; }
 
 ; CHECK: Loop 'for.i' has cost = 3000000
-; CHECK-NEXT: Loop 'for.k' has cost = 2030000
-; CHECK-NEXT: Loop 'for.j' has cost = 1060000
+; CHECK-NEXT: Loop 'for.k' has cost = 2040000
+; CHECK-NEXT: Loop 'for.j' has cost = 1080000
 
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A, ptr %B, ptr %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; }
 
 ; CHECK:Loop 'for.i' has cost = 2010000
-; CHECK-NEXT:Loop 'for.k' has cost = 1040000
-; CHECK-NEXT:Loop 'for.j' has cost = 70000
+; CHECK-NEXT:Loop 'for.k' has cost = 1050000
+; CHECK-NEXT:Loop 'for.j' has cost = 90000
     
 define void @matmul(i64 %n, i64 %m, i64 %o, ptr %A, ptr %B, ptr %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
@@ -17,8 +17,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ; CHECK: Loop 'k_loop' has cost = 10200000000000000
 ; CHECK-NEXT: Loop 'j_loop' has cost = 102000000000000
 ; CHECK-NEXT: Loop 'i_loop' has cost = 1020000000000
-; CHECK-NEXT: Loop 'm_loop' has cost = 10700000000
-; CHECK-NEXT: Loop 'l_loop' has cost = 1300000000
+; CHECK-NEXT: Loop 'm_loop' has cost = 10800000000
+; CHECK-NEXT: Loop 'l_loop' has cost = 1500000000
 
 %_elem_type_of_double = type <{ double }>
 
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
@@ -5,7 +5,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 
 ; CHECK: Loop 'for.j' has cost = 201000000
 ; CHECK-NEXT: Loop 'for.i' has cost = 102000000
-; CHECK-NEXT: Loop 'for.k' has cost = 90000
+; CHECK-NEXT: Loop 'for.k' has cost = 120000
 
 ;; Test to make sure when we have multiple conflicting access patterns, the 
 ;; chosen loop configuration favours the majority of those accesses. 
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
@@ -12,7 +12,7 @@ target triple = "powerpc64le-unknown-linux-gnu"
 
 ; CHECK: Loop 'for.i' has cost = 100000000
 ; CHECK-NEXT: Loop 'for.j' has cost = 1000000
-; CHECK-NEXT: Loop 'for.k' has cost = 60000
+; CHECK-NEXT: Loop 'for.k' has cost = 70000
 
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A) {
 entry:
@@ -90,7 +90,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 
 ; CHECK: Loop 'for.i' has cost = 100000000
 ; CHECK-NEXT: Loop 'for.j' has cost = 1000000
-; CHECK-NEXT: Loop 'for.k' has cost = 60000
+; CHECK-NEXT: Loop 'for.k' has cost = 70000
 
 define void @foo2(i64 %n, i64 %m, i64 %o, ptr %A) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
@@ -11,8 +11,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 ;     }   
 ; }
 
-; CHECK: Loop 'for.i' has cost = 20600
-; CHECK-NEXT: Loop 'for.j' has cost = 800
+; CHECK: Loop 'for.i' has cost = 20800
+; CHECK-NEXT: Loop 'for.j' has cost = 1000
 
 define void @foo(i64 %n, i64 %m, ptr %A, ptr %B, ptr %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/compute-cost.ll
@@ -8,6 +8,9 @@
 ; Check IndexedReference::computeRefCost can handle type differences between
 ; Stride and TripCount
 
+; Round costs up to the nearest whole number i.e. in 'for.cond5' cost is calculated 12.5 and
+; it makes more sense to say 13 cache lines are used rather than 12 cache lines.
+
 ; SMALLER-CACHELINE: Loop 'for.cond' has cost = 256
 ; LARGER-CACHELINE: Loop 'for.cond' has cost = 32
 %struct._Handleitem = type { ptr }
@@ -40,10 +43,10 @@ for.end:                                          ; preds = %for.cond
 
 ; SMALLER-CACHELINE: Loop 'for.cond' has cost = 100000000
 ; SMALLER-CACHELINE: Loop 'for.cond1' has cost = 1000000
-; SMALLER-CACHELINE: Loop 'for.cond5' has cost = 120000
+; SMALLER-CACHELINE: Loop 'for.cond5' has cost = 130000
 ; LARGER-CACHELINE: Loop 'for.cond' has cost = 100000000
 ; LARGER-CACHELINE: Loop 'for.cond1' has cost = 1000000
-; LARGER-CACHELINE: Loop 'for.cond5' has cost = 10000
+; LARGER-CACHELINE: Loop 'for.cond5' has cost = 20000
 @data = external dso_local global [2 x [4 x [18 x i32]]], align 1
 
 define dso_local void @handle_to_ptr_2(i1 %b0, i1 %b1, i1 %b2) {
@@ -122,8 +125,8 @@ for.neg.end:                                          ; preds = %for.neg.cond
 ; access functions. When this is fixed this testcase should have a cost
 ; approximately 2x higher.
 
-; SMALLER-CACHELINE: Loop 'for.cond2' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond2' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond2' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond2' has cost = 1281
 define void @Test2(ptr %B) {
 entry:
   br label %for.cond2
@@ -153,8 +156,8 @@ for.end:                                          ; preds = %for.cond
 ;   for (i = 40960; i > 0; i--)
 ;     C[i] = C[i];
 
-; SMALLER-CACHELINE: Loop 'for.cond3' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond3' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond3' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond3' has cost = 1281
 define void @Test3(ptr %C) {
 entry:
   br label %for.cond3
@@ -183,8 +186,8 @@ for.end:                                          ; preds = %for.cond
 ;  for (i = 0; i < 40960; i++)
 ;     D[i] = D[i];
 
-; SMALLER-CACHELINE: Loop 'for.cond4' has cost = 10240
-; LARGER-CACHELINE: Loop 'for.cond4' has cost = 1280
+; SMALLER-CACHELINE: Loop 'for.cond4' has cost = 10241
+; LARGER-CACHELINE: Loop 'for.cond4' has cost = 1281
 define void @Test4(ptr %D) {
 entry:
   br label %for.cond4
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll b/llvm/test/Analysis/LoopCacheAnalysis/interchange-cost-beneficial.ll
@@ -0,0 +1,62 @@
+; RUN: opt <  %s  -cache-line-size=64 -passes='print<loop-cache-cost>' -disable-output 2>&1 | FileCheck  %s
+
+;; This test checks the effect of rounding cache cost to 1 when it is 
+;; evaluated to 0 because at least 1 cache line is accessed by the loopnest.
+;; It does not make sense to output that zero cache lines are used.
+;; The cost of reference group for B[j], C[j], D[j] and E[j] were 
+;; calculted 0 before but now they are 1 which makes each loop cost more reasonable.
+;
+; void test(int n, int m, int o, int A[2][3], int B[2], int C[2], int D[2], int E[2]) {
+;   for (int i = 0; i < 3; i++)
+;     for (int j = 0; j < 2; j++)
+;        A[j][i] = 1;
+;        B[j] = 1;
+;        C[j] = 1;
+;        D[j] = 1
+;        E[j] = 1
+; }
+
+; CHECK: Loop 'for.j' has cost = 18
+; CHECK-NEXT: Loop 'for.i' has cost = 10
+
+define void @test(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E) {
+
+entry:
+  br label %for.i.preheader.split
+
+for.i.preheader.split:                            ; preds = %for.i.preheader
+  br label %for.i
+
+for.i:                                            ; preds = %for.inci, %for.i.preheader.split
+  %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ]
+  br label %for.j
+
+for.j:                                            ; preds = %for.incj, %for.i
+  %j = phi i64 [ %incj, %for.j ], [ 0, %for.i ]
+  %mul_j = mul nsw i64 %j, 3
+  %index_j = add i64 %mul_j, %i
+  %arrayidxA = getelementptr inbounds [2 x [ 3 x i32]], ptr %A, i64 %j, i64 %i
+  store i32 1, ptr %arrayidxA, align 4
+  %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %j
+  store i32 1, ptr %arrayidxB, align 4
+  %arrayidxC = getelementptr inbounds i32, ptr %C, i64 %j
+  store i32 1, ptr %arrayidxC, align 4
+  %arrayidxD = getelementptr inbounds i32, ptr %D, i64 %j
+  store i32 1, ptr %arrayidxD, align 4
+  %arrayidxE = getelementptr inbounds i32, ptr %E, i64 %j
+  store i32 1, ptr %arrayidxE, align 4
+  %incj = add nsw i64 %j, 1
+  %exitcond.us = icmp eq i64 %incj, 2
+  br i1 %exitcond.us, label %for.inci, label %for.j
+
+for.inci:                                         ; preds = %for.incj
+  %inci = add nsw i64 %i, 1
+  %exitcond55.us = icmp eq i64 %inci, 3
+  br i1 %exitcond55.us, label %for.end.loopexit, label %for.i
+
+for.end.loopexit:                                 ; preds = %for.inci
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@@ -1,42 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file=%t %s
 
 @b = external dso_local global [5 x i32], align 16
 
+;; Not profitable to interchange, because the access is invariant to j loop.
+;;
+;;  for(int i=0;i<4;i++) {
+;;    for(int j=1;j<4;j++) {
+;;      b[i] = ....
+;;    }
+;; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        test1
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+
 define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    br label [[FOR_INC:%.*]]
-; CHECK:       for.body2.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY2:%.*]]
-; CHECK:       for.body2:
-; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    store i32 undef, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.inc.split:
-; CHECK-NEXT:    [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    br label [[FOR_INC3]]
-; CHECK:       for.inc3:
-; CHECK-NEXT:    [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK:       for.cond.for.end5_crit_edge:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -68,41 +51,15 @@ for.cond.for.end5_crit_edge:                      ; preds = %for.inc3
   ret void
 }
 
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        test2
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+
 define void @test2() {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY2_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INC41:%.*]] = phi i32 [ [[INC4:%.*]], [[FOR_INC3:%.*]] ], [ undef, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INC41]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], ptr @b, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    br label [[FOR_INC:%.*]]
-; CHECK:       for.body2.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY2:%.*]]
-; CHECK:       for.body2:
-; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_INC_SPLIT:%.*]] ], [ 1, [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    [[CMP_ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store i32 [[CMP_ZEXT]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    br label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; CHECK:       for.inc.split:
-; CHECK-NEXT:    [[TMP1]] = add nuw nsw i32 [[LSR_IV]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[LSR_IV]], 4
-; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY2]], label [[FOR_COND_FOR_END5_CRIT_EDGE:%.*]]
-; CHECK:       for.cond1.for.end_crit_edge:
-; CHECK-NEXT:    br label [[FOR_INC3]]
-; CHECK:       for.inc3:
-; CHECK-NEXT:    [[INC4]] = add nsw i32 [[INC41]], 1
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[FOR_INC_SPLIT]]
-; CHECK:       for.cond.for.end5_crit_edge:
-; CHECK-NEXT:    ret void
-;
 entry:
   br label %for.body