Skip to content

Commit f47a24c

Browse files
committed
Rebase
1 parent 91122e4 commit f47a24c

File tree

8 files changed

+109
-90
lines changed

8 files changed

+109
-90
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2721,7 +2721,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
27212721
VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
27222722
VPWidenCastRecipe *Ext0,
27232723
VPWidenCastRecipe *Ext1, Type *ResultTy,
2724-
unsigned ScaleFactor = 0)
2724+
unsigned ScaleFactor = 1)
27252725
: VPReductionRecipe(
27262726
VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
27272727
{R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,7 +2572,7 @@ VPExtendedReductionRecipe::computeCost(ElementCount VF,
25722572
InstructionCost
25732573
VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
25742574
VPCostContext &Ctx) const {
2575-
if (getVFScaleFactor() > 0) {
2575+
if (getVFScaleFactor() > 1) {
25762576
return Ctx.TTI.getPartialReductionCost(
25772577
Instruction::Add, Ctx.Types.inferScalarType(getVecOp0()),
25782578
Ctx.Types.inferScalarType(getVecOp1()), getResultType(), VF,
@@ -2658,7 +2658,7 @@ void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
26582658
O << " = ";
26592659
getChainOp()->printAsOperand(O, SlotTracker);
26602660
O << " + ";
2661-
if (getVFScaleFactor() > 0)
2661+
if (getVFScaleFactor() > 1)
26622662
O << "partial.";
26632663
O << "reduce."
26642664
<< Instruction::getOpcodeName(

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,7 +2583,7 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
25832583

25842584
// Generate VPReductionRecipe.
25852585
VPReductionRecipe *Red = nullptr;
2586-
if (unsigned ScaleFactor = MulAcc->getVFScaleFactor())
2586+
if (unsigned ScaleFactor = MulAcc->getVFScaleFactor(); ScaleFactor > 1)
25872587
Red = new VPPartialReductionRecipe(Instruction::Add, MulAcc->getChainOp(),
25882588
Mul, MulAcc->getCondOp(), ScaleFactor);
25892589
else
@@ -2940,7 +2940,8 @@ tryToCreateAbstractPartialReductionRecipe(VPPartialReductionRecipe *PRed) {
29402940
return;
29412941

29422942
auto *AbstractR = new VPMulAccumulateReductionRecipe(
2943-
PRed, BinOpR, Ext0R, Ext1R, Ext0R->getResultType());
2943+
PRed, BinOpR, Ext0R, Ext1R, Ext0R->getResultType(),
2944+
PRed->getVFScaleFactor());
29442945
AbstractR->insertBefore(PRed);
29452946
PRed->replaceAllUsesWith(AbstractR);
29462947
}

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
2222
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
2323
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
2424
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
25-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
25+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
2626
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
2727
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
2828
; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ define i32 @dotp(ptr %a, ptr %b) {
1919
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
2020
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
2121
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
22-
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
22+
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
2323
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
2424
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
2525
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
@@ -49,7 +49,7 @@ define i32 @dotp(ptr %a, ptr %b) {
4949
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
5050
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
5151
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
52-
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
52+
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
5353
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
5454
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
5555
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
@@ -83,7 +83,7 @@ define i32 @dotp(ptr %a, ptr %b) {
8383
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
8484
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
8585
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
86-
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
86+
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
8787
; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
8888
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
8989
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
@@ -800,37 +800,68 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
800800
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
801801
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
802802
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
803+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
803804
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
805+
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
804806
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
807+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
805808
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
806809
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
807810
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
808-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
811+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
812+
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
813+
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
814+
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP36]]
815+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP37]])
809816
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
817+
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
810818
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
819+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP41]], align 1
811820
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
821+
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
812822
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
823+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
813824
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
814825
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
815826
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
816-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
827+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP21]])
828+
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
829+
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
830+
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP46]]
831+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP47]])
817832
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
833+
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
818834
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
835+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP51]], align 1
819836
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
837+
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
820838
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
839+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
821840
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
822841
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
823842
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
824-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
843+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
844+
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
845+
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
846+
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP39]]
847+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
825848
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
849+
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
826850
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
851+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
827852
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
853+
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
828854
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
855+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP44]], align 1
829856
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
830857
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
831858
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
832859
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
833-
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
860+
; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
861+
; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
862+
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP49]]
863+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]])
864+
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
834865
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
835866
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
836867
; CHECK-INTERLEAVED: middle.block:
@@ -915,6 +946,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
915946
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
916947
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
917948
; CHECK-MAXBW: scalar.ph:
949+
;
918950
entry:
919951
br label %for.body
920952

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
110110
; CHECK-MAXBW: vector.body:
111111
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
112112
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
113-
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
114-
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
113+
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
115114
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
116115
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
117-
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
116+
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
118117
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
119118
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
120119
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
@@ -262,10 +261,8 @@ define i64 @dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) #1
262261
; CHECK-MAXBW: vector.body:
263262
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
264263
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
265-
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
266-
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP8]]
267-
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
268-
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
264+
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
265+
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
269266
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
270267
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
271268
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
@@ -1407,10 +1404,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
14071404
; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
14081405
; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
14091406
; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
1410-
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
1411-
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
1412-
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
1413-
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
1407+
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
1408+
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
1409+
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1
14141410
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
14151411
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
14161412
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX]], 2
@@ -1636,7 +1632,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
16361632
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
16371633
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
16381634
; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
1639-
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
1635+
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
16401636
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
16411637
; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
16421638
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
@@ -1943,11 +1939,10 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
19431939
; CHECK-MAXBW: vector.body:
19441940
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
19451941
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
1946-
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
1947-
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
1942+
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
19481943
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
19491944
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
1950-
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
1945+
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
19511946
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
19521947
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
19531948
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
@@ -2567,10 +2562,8 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
25672562
; CHECK-MAXBW: vector.body:
25682563
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
25692564
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
2570-
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
2571-
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
2572-
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
2573-
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
2565+
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
2566+
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
25742567
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
25752568
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
25762569
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0

0 commit comments

Comments
 (0)