-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISCV] Don't vectorize for loops with small trip count #132176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Pengcheng Wang (wangpc-pp) ChangesInspired by https://reviews.llvm.org/D130755. I don't know the logic behind the value 5, it is copied from AArch64. For some tests, I have to change the trip count so that we don't Patch is 45.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132176.diff 6 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 8ffe1b08d1e26..3982f30da01a7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -388,6 +388,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
bool enableInterleavedAccessVectorization() { return true; }
+ unsigned getMinTripCountTailFoldingThreshold() const {
+ return ST->hasVInstructions() ? 5 : 0;
+ }
+
enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
unsigned getNumberOfRegisters(unsigned ClassID) const {
switch (ClassID) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 10ac870c112ae..852e7a18ecbb8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -45,47 +45,19 @@ for.end: ; preds = %for.body
define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip3_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -112,47 +84,19 @@ for.end: ; preds = %for.body
define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip5_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -219,7 +163,7 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -277,7 +221,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -336,7 +280,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -380,7 +324,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -397,7 +341,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index da859963a7021..4f05292c6fb94 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -5,65 +5,149 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ptr [[P:%.*]], i64 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT]], splat (i64 48)
-; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP2]], splat (i64 52)
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = shl <16 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i64> [[TMP0]], splat (i64 52)
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 3)
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8)
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE32]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP4]], i32 9)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <16 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP3]], <16 x i32> [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i8> [[TMP8]], i32 0
-; CHECK-NEXT: store i8 [[TMP10]], ptr [[P]], align 1
-; CHECK-NEXT: br label [[VECTOR_BODY]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP8]], i32 0
+; CHECK-NEXT: store i8 [[TMP19]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK-NEXT: [[CMP_N:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: pred.store.if3:
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[TMP8]], i32 1
; CHECK-NEXT: store i8 [[TMP12]], ptr [[P]], align 1
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 2
; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[TMP8]], i32 2
; CHECK-NEXT: store i8 [[TMP14]], ptr [[P]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
; CHECK: pred.store.if7:
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[TMP8]], i32 3
; CHECK-NEXT: store i8 [[TMP16]], ptr [[P]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
; CHECK: pred.store.continue8:
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 4
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[TMP8]], i32 4
+; CHECK-NEXT: store i8 [[TMP18]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 5
+; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK: pred.store.if11:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[TMP8]], i32 5
+; CHECK-NEXT: store i8 [[TMP20]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK: pred.store.continue12:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 6
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK: pred.store.if13:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[TMP8]], i32 6
+; CHECK-NEXT: store i8 [[TMP22]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 7
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK: pred.store.if15:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[TMP8]], i32 7
+; CHECK-NEXT: store i8 [[TMP24]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
+; CHECK: pred.store.continue16:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 8
+; ...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Pengcheng Wang (wangpc-pp) ChangesInspired by https://reviews.llvm.org/D130755. I don't know the logic behind the value 5, it is copied from AArch64. For some tests, I have to change the trip count so that we don't Patch is 45.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132176.diff 6 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 8ffe1b08d1e26..3982f30da01a7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -388,6 +388,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
bool enableInterleavedAccessVectorization() { return true; }
+ unsigned getMinTripCountTailFoldingThreshold() const {
+ return ST->hasVInstructions() ? 5 : 0;
+ }
+
enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
unsigned getNumberOfRegisters(unsigned ClassID) const {
switch (ClassID) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 10ac870c112ae..852e7a18ecbb8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -45,47 +45,19 @@ for.end: ; preds = %for.body
define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip3_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -112,47 +84,19 @@ for.end: ; preds = %for.body
define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip5_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -219,7 +163,7 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -277,7 +221,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -336,7 +280,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -380,7 +324,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -397,7 +341,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index da859963a7021..4f05292c6fb94 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -5,65 +5,149 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ptr [[P:%.*]], i64 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT]], splat (i64 48)
-; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP2]], splat (i64 52)
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = shl <16 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i64> [[TMP0]], splat (i64 52)
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 3)
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8)
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE32]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP4]], i32 9)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <16 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP3]], <16 x i32> [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i8> [[TMP8]], i32 0
-; CHECK-NEXT: store i8 [[TMP10]], ptr [[P]], align 1
-; CHECK-NEXT: br label [[VECTOR_BODY]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP8]], i32 0
+; CHECK-NEXT: store i8 [[TMP19]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK-NEXT: [[CMP_N:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: pred.store.if3:
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[TMP8]], i32 1
; CHECK-NEXT: store i8 [[TMP12]], ptr [[P]], align 1
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 2
; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[TMP8]], i32 2
; CHECK-NEXT: store i8 [[TMP14]], ptr [[P]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
; CHECK: pred.store.if7:
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[TMP8]], i32 3
; CHECK-NEXT: store i8 [[TMP16]], ptr [[P]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
; CHECK: pred.store.continue8:
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 4
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[TMP8]], i32 4
+; CHECK-NEXT: store i8 [[TMP18]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 5
+; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK: pred.store.if11:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[TMP8]], i32 5
+; CHECK-NEXT: store i8 [[TMP20]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK: pred.store.continue12:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 6
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK: pred.store.if13:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[TMP8]], i32 6
+; CHECK-NEXT: store i8 [[TMP22]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 7
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK: pred.store.if15:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[TMP8]], i32 7
+; CHECK-NEXT: store i8 [[TMP24]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
+; CHECK: pred.store.continue16:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 8
+; ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. I’ve been using 3 locally, but I don’t have any objections to using 5.
I was thinking that we may skip vectorization when we can't occupy one whole vector register, but here we can only specify the trip count, not the total bytes we will process. |
I will land this if there are no more comments today (Monday). |
Do we still want to vectorize small trip counts with EVL tail folding? I can also queue up a performance run for this on the BPI-F3 (without tail folding) |
I think we should not, EVL tail folding doesn't help a lot for small trip count loops. And SLP can handle them if profitable.
That'll be helpful! Based on my experience years ago, we should have some gains on SPEC CPU because LV vectorizes some small loops and causes more instructions/stalls. |
Statistics on llvm-test-suite:
5 seems reasonable. |
dd629d5
to
01f8b8b
Compare
Inspired by https://reviews.llvm.org/D130755. I don't know the logic behind the value 5, it is copied from AArch64. For some tests, I have to change the trip count so that we don't break what they are testing.
01f8b8b
to
2f782c9
Compare
Inspired by https://reviews.llvm.org/D130755.
I don't know the logic behind the value 5, it is copied from AArch64.
For some tests, I have to change the trip count so that we don't
break what they are testing.