
Commit ed03070

[SLP] Support vectorizing 2^N-1 reductions (#106266)
Build on the -slp-vectorize-non-power-of-2 experimental option, and support vectorizing reductions with 2^N-1 sized vectors. Specifically, two related changes:

1) When searching for a profitable VL, start with the 2^N-1 reduction width. If the cost model does not select that VL, return to power-of-two boundaries when halving the search VL. The latter is mostly for simplicity.

2) Reduce the minimum reduction width from 4 to 3 when supporting non-power-of-two vectors. This is required to support <3 x Ty> cases.

One thing that isn't directly related to this change, but worth noting for clarity: the non-power-of-two vectorization appears to be sensitive to the operand order of the reduction. I haven't yet fully figured out why, but I suspect it is non-power-of-two specific.
1 parent edbd9d1 commit ed03070
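As a worked illustration of point (1), the following is a minimal standalone sketch of the starting-width rule, written with the C++20 <bit> equivalents of the llvm:: bit helpers; the function name and signature are hypothetical, not part of the patch.

#include <algorithm>
#include <bit>

// A width of the form 2^N - 1 (3, 7, 15, ...) is exactly a width W for
// which has_single_bit(W + 1) holds. With the experimental flag set,
// such a width is kept as the initial VL; otherwise the width is
// rounded down to a power of two, as before.
unsigned startingReduxWidth(unsigned NumReducedVals, unsigned MaxElts,
                            bool VectorizeNonPowerOf2) {
  unsigned ReduxWidth = NumReducedVals;
  if (!VectorizeNonPowerOf2 || !std::has_single_bit(ReduxWidth + 1))
    ReduxWidth = std::bit_floor(ReduxWidth);
  return std::min(ReduxWidth, MaxElts);
}

For example, NumReducedVals == 7 starts the search at VL 7 with the flag (4 without), while NumReducedVals == 6 starts at VL 4 either way.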

File tree

2 files changed: +127 −89 lines changed


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 13 additions & 15 deletions
@@ -17583,7 +17583,7 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI) {
-    constexpr int ReductionLimit = 4;
+    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -17799,13 +17799,15 @@

     unsigned MaxVecRegSize = V.getMaxVecRegSize();
     unsigned EltSize = V.getVectorElementSize(Candidates[0]);
-    unsigned MaxElts =
-        RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
+    const unsigned MaxElts = std::clamp<unsigned>(
+        llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
+        RegMaxNumber * RedValsMaxNumber);
+
+    unsigned ReduxWidth = NumReducedVals;
+    if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
+      ReduxWidth = bit_floor(ReduxWidth);
+    ReduxWidth = std::min(ReduxWidth, MaxElts);

-    unsigned ReduxWidth = std::min<unsigned>(
-        llvm::bit_floor(NumReducedVals),
-        std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
-                             RegMaxNumber * RedValsMaxNumber));
     unsigned Start = 0;
     unsigned Pos = Start;
     // Restarts vectorization attempt with lower vector factor.
@@ -17825,7 +17827,7 @@
       if (Pos < NumReducedVals - ReduxWidth + 1)
         return IsAnyRedOpGathered;
       Pos = Start;
-      ReduxWidth /= 2;
+      ReduxWidth = bit_ceil(ReduxWidth) / 2;
       return IsAnyRedOpGathered;
     };
     bool AnyVectorized = false;
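To make the retry sequence concrete, here is a small standalone trace of the halving rule above (a sketch; std::bit_ceil from C++20 <bit> stands in for llvm::bit_ceil). A failed attempt at a 2^N-1 width drops the search back onto power-of-two boundaries.

#include <bit>
#include <cstdio>

int main() {
  // After a rejected attempt at width W, the next width tried is
  // bit_ceil(W) / 2. Starting from the non-power-of-two width 7:
  for (unsigned W = 7; W >= 2; W = std::bit_ceil(W) / 2)
    std::printf("%u ", W); // prints: 7 4 2
  return 0;
}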
@@ -18014,12 +18016,10 @@
               createStrideMask(I, ScalarTyNumElements, VL.size());
           Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
           ReducedSubTree = Builder.CreateInsertElement(
-              ReducedSubTree, emitReduction(Lane, Builder, ReduxWidth, TTI),
-              I);
+              ReducedSubTree, emitReduction(Lane, Builder, TTI), I);
         }
       } else {
-        ReducedSubTree =
-            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI);
       }
       if (ReducedSubTree->getType() != VL.front()->getType()) {
         assert(ReducedSubTree->getType() != VL.front()->getType() &&
@@ -18301,10 +18301,8 @@

   /// Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
-                       unsigned ReduxWidth, const TargetTransformInfo *TTI) {
+                       const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
-    assert(has_single_bit(ReduxWidth) &&
-           "We only handle power-of-two reductions for now");
     assert(RdxKind != RecurKind::FMulAdd &&
            "A call to the llvm.fmuladd intrinsic is not handled yet");
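For context on why emitReduction can drop its ReduxWidth parameter along with the power-of-two assert: the reduce intrinsics carry the lane count in the type of their vector operand. Below is a minimal IRBuilder sketch of the kind of fadd reduction shown in the test updates; the helper name is hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emits @llvm.vector.reduce.fadd with start value -0.0 (the fadd
// identity). The width (e.g. v3f32) comes entirely from the vector
// type of Src, so no separate ReduxWidth argument is needed.
static Value *emitFAddReduction(IRBuilderBase &Builder, Value *Src) {
  Type *EltTy = cast<VectorType>(Src->getType())->getElementType();
  return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(EltTy), Src);
}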

llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll

Lines changed: 114 additions & 74 deletions
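(The NON-POW2 and POW2-ONLY check prefixes below presumably correspond to RUN lines that exercise the SLP vectorizer with and without -slp-vectorize-non-power-of-2; the RUN lines themselves fall outside these hunks.)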
@@ -434,16 +434,22 @@ define i32 @reduce_add(ptr %src) {
 }

 define float @reduce_fadd(ptr %src) {
-; CHECK-LABEL: @reduce_fadd(
-; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
-; CHECK-NEXT:    [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
-; CHECK-NEXT:    [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
-; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
-; CHECK-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
-; CHECK-NEXT:    ret float [[ADD_1]]
+; NON-POW2-LABEL: @reduce_fadd(
+; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float -0.000000e+00, <3 x float> [[TMP1]])
+; NON-POW2-NEXT:    ret float [[TMP2]]
+;
+; POW2-ONLY-LABEL: @reduce_fadd(
+; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
+; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
+; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
+; POW2-ONLY-NEXT:    ret float [[ADD_1]]
 ;
   %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
   %l.src.0 = load float, ptr %gep.src.0, align 4
@@ -458,19 +464,26 @@ define float @reduce_fadd(ptr %src) {
 }

 define i32 @reduce_add_after_mul(ptr %src) {
-; CHECK-LABEL: @reduce_add_after_mul(
-; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
-; CHECK-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
-; CHECK-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
-; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
-; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
-; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret i32 [[ADD_1]]
+; NON-POW2-LABEL: @reduce_add_after_mul(
+; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], <i32 10, i32 10, i32 10>
+; NON-POW2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
+; NON-POW2-NEXT:    ret i32 [[TMP3]]
+;
+; POW2-ONLY-LABEL: @reduce_add_after_mul(
+; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
+; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
+; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
+; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
+; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
+; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
 ;
   %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
   %l.src.0 = load i32, ptr %gep.src.0, align 4
@@ -489,25 +502,34 @@ define i32 @reduce_add_after_mul(ptr %src) {
 }

 define i32 @dot_product_i32(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_i32(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; CHECK-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; CHECK-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret i32 [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_i32(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
+; NON-POW2-NEXT:    ret i32 [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_i32(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
+; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
+; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
+; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
   %l.a.0 = load i32, ptr %gep.a.0, align 4
@@ -533,22 +555,31 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
 }

 define float @dot_product_fp32(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp32(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret float [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp32(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float -0.000000e+00, <3 x float> [[TMP3]])
+; NON-POW2-NEXT:    ret float [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp32(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret float [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
   %l.a.0 = load float, ptr %gep.a.0, align 4
@@ -574,22 +605,31 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
 }

 define double @dot_product_fp64(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp64(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret double [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp64(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double -0.000000e+00, <3 x double> [[TMP3]])
+; NON-POW2-NEXT:    ret double [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp64(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret double [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
   %l.a.0 = load double, ptr %gep.a.0, align 4
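For reference, an illustrative scalar equivalent of what the NON-POW2 dot-product checks compute (not part of the test; with fast-math the association order of the floating-point sum is unspecified):

// Scalar reference for the <3 x double> fmul + llvm.vector.reduce.fadd
// sequence checked above.
double dot_product_fp64_ref(const double *a, const double *b) {
  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}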
