|
1 | 1 | ; REQUIRES: asserts
|
2 | 2 | ; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
|
| 3 | +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE |
3 | 4 |
|
4 | 5 | target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
5 | 6 | target triple = "x86_64-unknown-linux-gnu"
|
@@ -65,3 +66,85 @@ for.end:
|
65 | 66 | }
|
66 | 67 |
|
67 | 68 | attributes #0 = { "target-cpu"="knl" }
|
| 69 | + |
| 70 | +; CHECK-LABEL: PR40816 |
| 71 | +; |
| 72 | +; Check that scalar-with-predication instructions are not considered uniform |
| 73 | +; after vectorization, because such instructions are replicated in a predicated |
| 74 | +; region per lane instead of having a single instance (out of VF). The predication |
| 75 | +; stems from a tiny trip count of 3, which leads to folding the tail by masking |
| 75 | +; with icmp ule <i, i+1>, <2, 2>. |
| 76 | +; |
| 77 | +; CHECK: LV: Found trip count: 3 |
| 78 | +; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0 |
| 79 | +; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, i32* {{%.*}}, align 1 |
| 80 | +; CHECK: LV: Found not uniform being ScalarWithPredication: {{%.*}} = load i32, i32* {{%.*}}, align 1 |
| 81 | +; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}} |
| 82 | +; |
| 83 | +; FORCE-LABEL: @PR40816( |
| 84 | +; FORCE-NEXT: entry: |
| 85 | +; FORCE-NEXT: br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]] |
| 86 | +; FORCE: vector.ph: |
| 87 | +; FORCE-NEXT: br label [[VECTOR_BODY:%.*]] |
| 88 | +; FORCE: vector.body: |
| 89 | +; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ] |
| 90 | +; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ] |
| 91 | +; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 |
| 92 | +; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 |
| 93 | +; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2> |
| 94 | +; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 |
| 95 | +; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] |
| 96 | +; FORCE: pred.store.if: |
| 97 | +; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1 |
| 98 | +; FORCE-NEXT: br label [[PRED_STORE_CONTINUE]] |
| 99 | +; FORCE: pred.store.continue: |
| 100 | +; FORCE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 |
| 101 | +; FORCE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] |
| 102 | +; FORCE: pred.store.if1: |
| 103 | +; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1 |
| 104 | +; FORCE-NEXT: br label [[PRED_STORE_CONTINUE2]] |
| 105 | +; FORCE: pred.store.continue2: |
| 106 | +; FORCE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 |
| 107 | +; FORCE-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] |
| 108 | +; FORCE: pred.load.if: |
| 109 | +; FORCE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]] |
| 110 | +; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1 |
| 111 | +; FORCE-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0 |
| 112 | +; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]] |
| 113 | +; FORCE: pred.load.continue: |
| 114 | +; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] |
| 115 | +; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 |
| 116 | +; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]] |
| 117 | +; FORCE: pred.load.if3: |
| 118 | +; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]] |
| 119 | +; FORCE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1 |
| 120 | +; FORCE-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 |
| 121 | +; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]] |
| 122 | +; FORCE: pred.load.continue4: |
| 123 | +; FORCE-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ] |
| 124 | +; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 |
| 125 | +; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2> |
| 126 | +; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 |
| 127 | +; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]] |
| 128 | +; |
| 129 | +@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1 |
| 130 | +@b = external global i32, align 1 |
| 131 | + |
; Reproducer for PR40816: a single-block loop whose exit is controlled by the
; value loaded from @a. With @a = [7, 7, 0] the loop body runs three times,
; which matches the "Found trip count: 3" check above.
| 132 | +define void @PR40816() #1 { |
| 133 | + |
| 134 | +entry: |
| 135 | + br label %for.body |
| 136 | + |
; Loop body: %0 is the induction variable (starts at 0, incremented by 1 via
; %inc). Each iteration stores %0 to @b, loads element %0 of @a, and exits to
; %return when the loaded value equals 0; otherwise it branches back here.
| 137 | +for.body: ; preds = %for.body, %entry |
| 138 | + %0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] |
| 139 | + store i32 %0, i32* @b, align 1 |
| 140 | + %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0 |
| 141 | + %1 = load i32, i32* %arrayidx1, align 1 |
| 142 | + %cmp2 = icmp eq i32 %1, 0 |
| 143 | + %inc = add nuw nsw i32 %0, 1 |
| 144 | + br i1 %cmp2, label %return, label %for.body |
| 145 | + |
; Single exit block of the loop.
| 146 | +return: ; preds = %for.body |
| 147 | + ret void |
| 148 | +} |
| 149 | + |
| 150 | +attributes #1 = { "target-cpu"="core2" } |
0 commit comments