
Commit a21fb74

[RISCV][TTI] Implement getPartialReductionCost for the vqdotq cases (#140974)
Doing so tells the loop vectorizer that the partial.reduce intrinsic is profitable to use over the plain extend/multiply/reduce.add sequence.
1 parent 23d4756 commit a21fb74
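
For context, the choice the cost model is steering is visible in the updated test below. A trimmed sketch of the two vector-loop tails (IR taken from the test's check lines; value names shortened here for readability):

  ; without the hook: widen both i8 inputs to i32, multiply, and accumulate a full
  ; <vscale x 4 x i32> vector that is reduced once after the loop
  %mul = mul <vscale x 4 x i32> %b.ext, %a.ext
  %acc = add <vscale x 4 x i32> %mul, %vec.phi
  ; ...
  %sum = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %acc)

  ; with the hook: the same multiply feeds the partial-reduction intrinsic, which keeps
  ; only a <vscale x 1 x i32> accumulator and is expected to lower to the vqdot family
  %mul = mul <vscale x 4 x i32> %b.ext, %a.ext
  %acc = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %vec.phi, <vscale x 4 x i32> %mul)
  ; ...
  %sum = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> %acc)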

File tree

3 files changed: +177 -76 lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 23 additions & 0 deletions
@@ -294,6 +294,29 @@ RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
            : TTI::PSK_Software;
 }
 
+InstructionCost RISCVTTIImpl::getPartialReductionCost(
+    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
+    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
+    TTI::PartialReductionExtendKind OpBExtend,
+    std::optional<unsigned> BinOp) const {
+
+  // zve32x is broken for partial_reduce_umla, but let's make sure we
+  // don't generate them.
+  if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
+      Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
+      InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
+      OpAExtend != OpBExtend || !AccumType->isIntegerTy(32) ||
+      !VF.isKnownMultipleOf(4) || !VF.isScalable())
+    return InstructionCost::getInvalid();
+
+  Type *Tp = VectorType::get(AccumType, VF);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+  // Note: Asuming all vqdot* variants are equal cost
+  // TODO: Thread CostKind through this API
+  return LT.first * getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second,
+                                            TTI::TCK_RecipThroughput);
+}
+
 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
   // Currently, the ExpandReductions pass can't expand scalable-vector
   // reductions, but we still request expansion as RVV doesn't support certain
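
Read together, the early-exit above means the hook only returns a real cost for the one shape zvqdotq accelerates; anything else gets InstructionCost::getInvalid() and the vectorizer falls back to the ordinary extend/multiply/add costing. A rough mapping of the checks onto the IR pattern being priced (a sketch with illustrative value names, mirroring the test below):

  ; InputTypeA == InputTypeB == i8 and OpAExtend == OpBExtend: both operands widened the same way
  %a.ext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
  %b.ext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
  ; BinOp == Mul feeding Opcode == Add of the reduction
  %mul = mul <vscale x 4 x i32> %b.ext, %a.ext
  ; AccumType == i32 with a scalable VF that is a known multiple of 4, so four i8
  ; products fold into each i32 accumulator lane
  %acc = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %phi, <vscale x 4 x i32> %mul)

The returned cost is then the legalization count for VectorType::get(AccumType, VF) multiplied by the cost of a single vqdot.vv, treating all vqdot* variants as equal cost per the note in the code.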

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
@@ -107,6 +107,13 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   TargetTransformInfo::PopcntSupportKind
   getPopcntSupport(unsigned TyWidth) const override;
 
+  InstructionCost
+  getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
+                          Type *AccumType, ElementCount VF,
+                          TTI::PartialReductionExtendKind OpAExtend,
+                          TTI::PartialReductionExtendKind OpBExtend,
+                          std::optional<unsigned> BinOp) const override;
+
   bool shouldExpandReduction(const IntrinsicInst *II) const override;
   bool supportsScalableVectors() const override {
     return ST->hasVInstructions();

llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll

Lines changed: 147 additions & 76 deletions
@@ -5,42 +5,79 @@
 target triple = "riscv64-none-unknown-elf"
 
 define i32 @vqdot(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @vqdot(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
+; V-LABEL: define i32 @vqdot(
+; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; V-NEXT: entry:
+; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; V: vector.ph:
+; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; V-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT: br label [[VECTOR_BODY:%.*]]
+; V: vector.body:
+; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; V: middle.block:
+; V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; V: scalar.ph:
+;
+; ZVQDOTQ-LABEL: define i32 @vqdot(
+; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; ZVQDOTQ-NEXT: entry:
+; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; ZVQDOTQ: vector.ph:
+; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]]
+; ZVQDOTQ: vector.body:
+; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ZVQDOTQ: middle.block:
+; ZVQDOTQ-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; ZVQDOTQ: scalar.ph:
 ;
 entry:
   br label %for.body
@@ -66,42 +103,79 @@ for.exit: ; preds = %for.body
 
 
 define i32 @vqdotu(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @vqdotu(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
+; V-LABEL: define i32 @vqdotu(
+; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; V-NEXT: entry:
+; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; V: vector.ph:
+; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; V-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT: br label [[VECTOR_BODY:%.*]]
+; V: vector.body:
+; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; V: middle.block:
+; V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; V: scalar.ph:
+;
+; ZVQDOTQ-LABEL: define i32 @vqdotu(
+; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ZVQDOTQ-NEXT: entry:
+; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; ZVQDOTQ: vector.ph:
+; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]]
+; ZVQDOTQ: vector.body:
+; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; ZVQDOTQ: middle.block:
+; ZVQDOTQ-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; ZVQDOTQ: scalar.ph:
 ;
 entry:
   br label %for.body
@@ -128,7 +202,7 @@ for.exit: ; preds = %for.body
 
 define i32 @vqdotsu(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define i32 @vqdotsu(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -245,6 +319,3 @@ for.body: ; preds = %for.body, %entry
 for.exit: ; preds = %for.body
   ret i32 %add
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; V: {{.*}}
-; ZVQDOTQ: {{.*}}
