Skip to content

Commit f634096

Browse files
committed
[ARM] Perform lane interleaving from reductions.
We have an MVE pass that performs lane interleaving to make use of top/bottom instructions by adding shuffles before extends and after truncates. This patch extends it to also start from add reductions, where the order of lanes does not matter, so no shuffle is needed. We need to be careful not to break the form of existing reductions, but otherwise this can save some instructions and awkward extends. Differential Revision: https://reviews.llvm.org/D143396
1 parent aa661a1 commit f634096

File tree

2 files changed

+64
-48
lines changed

2 files changed

+64
-48
lines changed

llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
154154
static bool tryInterleave(Instruction *Start,
155155
SmallPtrSetImpl<Instruction *> &Visited) {
156156
LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
157-
auto *VT = cast<FixedVectorType>(Start->getType());
158157

159158
if (!isa<Instruction>(Start->getOperand(0)))
160159
return false;
@@ -165,6 +164,7 @@ static bool tryInterleave(Instruction *Start,
165164
Worklist.push_back(cast<Instruction>(Start->getOperand(0)));
166165

167166
SmallSetVector<Instruction *, 4> Truncs;
167+
SmallSetVector<Instruction *, 4> Reducts;
168168
SmallSetVector<Instruction *, 4> Exts;
169169
SmallSetVector<Use *, 4> OtherLeafs;
170170
SmallSetVector<Instruction *, 4> Ops;
@@ -198,6 +198,13 @@ static bool tryInterleave(Instruction *Start,
198198
if (!II)
199199
return false;
200200

201+
if (II->getIntrinsicID() == Intrinsic::vector_reduce_add) {
202+
if (!Reducts.insert(I))
203+
continue;
204+
Visited.insert(I);
205+
break;
206+
}
207+
201208
switch (II->getIntrinsicID()) {
202209
case Intrinsic::abs:
203210
case Intrinsic::smin:
@@ -267,21 +274,32 @@ static bool tryInterleave(Instruction *Start,
267274
return false;
268275

269276
LLVM_DEBUG({
270-
dbgs() << "Found group:\n Exts:";
277+
dbgs() << "Found group:\n Exts:\n";
271278
for (auto *I : Exts)
272279
dbgs() << " " << *I << "\n";
273-
dbgs() << " Ops:";
280+
dbgs() << " Ops:\n";
274281
for (auto *I : Ops)
275282
dbgs() << " " << *I << "\n";
276-
dbgs() << " OtherLeafs:";
283+
dbgs() << " OtherLeafs:\n";
277284
for (auto *I : OtherLeafs)
278285
dbgs() << " " << *I->get() << " of " << *I->getUser() << "\n";
279-
dbgs() << "Truncs:";
286+
dbgs() << " Truncs:\n";
280287
for (auto *I : Truncs)
281288
dbgs() << " " << *I << "\n";
289+
dbgs() << " Reducts:\n";
290+
for (auto *I : Reducts)
291+
dbgs() << " " << *I << "\n";
282292
});
283293

284-
assert(!Truncs.empty() && "Expected some truncs");
294+
assert((!Truncs.empty() || !Reducts.empty()) &&
295+
"Expected some truncs or reductions");
296+
if (Truncs.empty() && Exts.empty())
297+
return false;
298+
299+
auto *VT = !Truncs.empty()
300+
? cast<FixedVectorType>(Truncs[0]->getType())
301+
: cast<FixedVectorType>(Exts[0]->getOperand(0)->getType());
302+
LLVM_DEBUG(dbgs() << "Using VT:" << *VT << "\n");
285303

286304
// Check types
287305
unsigned NumElts = VT->getNumElements();
@@ -311,6 +329,14 @@ static bool tryInterleave(Instruction *Start,
311329
// Check that it looks beneficial
312330
if (!isProfitableToInterleave(Exts, Truncs))
313331
return false;
332+
if (!Reducts.empty() && (Ops.empty() || all_of(Ops, [](Instruction *I) {
333+
return I->getOpcode() == Instruction::Mul ||
334+
I->getOpcode() == Instruction::Select ||
335+
I->getOpcode() == Instruction::ICmp;
336+
}))) {
337+
LLVM_DEBUG(dbgs() << "Reduction does not look profitable\n");
338+
return false;
339+
}
314340

315341
// Create new shuffles around the extends / truncs / other leaves.
316342
IRBuilder<> Builder(Start);
@@ -367,6 +393,14 @@ static bool tryInterleave(Instruction *Start,
367393
return true;
368394
}
369395

396+
// Returns true if I is a vector_reduce_add call. Lane order does not matter for
397+
// add reductions, so interleaving can start there without emitting a shuffle.
398+
static bool isAddReduction(Instruction &I) {
399+
if (auto *II = dyn_cast<IntrinsicInst>(&I))
400+
return II->getIntrinsicID() == Intrinsic::vector_reduce_add;
401+
return false;
402+
}
403+
370404
bool MVELaneInterleaving::runOnFunction(Function &F) {
371405
if (!EnableInterleave)
372406
return false;
@@ -380,8 +414,10 @@ bool MVELaneInterleaving::runOnFunction(Function &F) {
380414

381415
SmallPtrSet<Instruction *, 16> Visited;
382416
for (Instruction &I : reverse(instructions(F))) {
383-
if (I.getType()->isVectorTy() &&
384-
(isa<TruncInst>(I) || isa<FPTruncInst>(I)) && !Visited.count(&I))
417+
if (((I.getType()->isVectorTy() &&
418+
(isa<TruncInst>(I) || isa<FPTruncInst>(I))) ||
419+
isAddReduction(I)) &&
420+
!Visited.count(&I))
385421
Changed |= tryInterleave(&I, Visited);
386422
}
387423

llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll

Lines changed: 20 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,12 @@
44
define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_mul(<16 x i8> %s0, <16 x i8> %s1) {
55
; CHECK-LABEL: reduce_v16i16_shift_mul:
66
; CHECK: @ %bb.0: @ %entry
7-
; CHECK-NEXT: .pad #32
8-
; CHECK-NEXT: sub sp, #32
9-
; CHECK-NEXT: add r0, sp, #16
10-
; CHECK-NEXT: mov r1, sp
11-
; CHECK-NEXT: vstrw.32 q1, [r0]
12-
; CHECK-NEXT: vstrw.32 q0, [r1]
13-
; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
14-
; CHECK-NEXT: vldrb.u16 q1, [r1, #8]
15-
; CHECK-NEXT: vldrb.u16 q2, [r1]
16-
; CHECK-NEXT: vmul.i16 q0, q1, q0
17-
; CHECK-NEXT: vldrb.u16 q1, [r0]
7+
; CHECK-NEXT: vmullt.u8 q2, q0, q1
8+
; CHECK-NEXT: vmullb.u8 q0, q0, q1
9+
; CHECK-NEXT: vshr.s16 q2, q2, #14
1810
; CHECK-NEXT: vshr.s16 q0, q0, #14
19-
; CHECK-NEXT: vmul.i16 q1, q2, q1
20-
; CHECK-NEXT: vaddv.u16 r0, q0
21-
; CHECK-NEXT: vshr.s16 q1, q1, #14
22-
; CHECK-NEXT: vaddva.u16 r0, q1
23-
; CHECK-NEXT: add sp, #32
11+
; CHECK-NEXT: vaddv.u16 r0, q2
12+
; CHECK-NEXT: vaddva.u16 r0, q0
2413
; CHECK-NEXT: bx lr
2514
entry:
2615
%s0s = zext <16 x i8> %s0 to <16 x i16>
@@ -50,23 +39,16 @@ entry:
5039
define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_sub(<16 x i8> %s0, <16 x i8> %s1) {
5140
; CHECK-LABEL: reduce_v16i16_shift_sub:
5241
; CHECK: @ %bb.0: @ %entry
53-
; CHECK-NEXT: .pad #32
54-
; CHECK-NEXT: sub sp, #32
55-
; CHECK-NEXT: add r0, sp, #16
56-
; CHECK-NEXT: mov r1, sp
57-
; CHECK-NEXT: vstrw.32 q1, [r0]
58-
; CHECK-NEXT: vstrw.32 q0, [r1]
59-
; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
60-
; CHECK-NEXT: vldrb.u16 q1, [r1, #8]
61-
; CHECK-NEXT: vldrb.u16 q2, [r1]
62-
; CHECK-NEXT: vsub.i16 q0, q1, q0
63-
; CHECK-NEXT: vldrb.u16 q1, [r0]
42+
; CHECK-NEXT: vmovlt.u8 q2, q1
43+
; CHECK-NEXT: vmovlt.u8 q3, q0
44+
; CHECK-NEXT: vsub.i16 q2, q3, q2
45+
; CHECK-NEXT: vmovlb.u8 q1, q1
46+
; CHECK-NEXT: vmovlb.u8 q0, q0
47+
; CHECK-NEXT: vshr.s16 q2, q2, #14
48+
; CHECK-NEXT: vsub.i16 q0, q0, q1
49+
; CHECK-NEXT: vaddv.u16 r0, q2
6450
; CHECK-NEXT: vshr.s16 q0, q0, #14
65-
; CHECK-NEXT: vsub.i16 q1, q2, q1
66-
; CHECK-NEXT: vaddv.u16 r0, q0
67-
; CHECK-NEXT: vshr.s16 q1, q1, #14
68-
; CHECK-NEXT: vaddva.u16 r0, q1
69-
; CHECK-NEXT: add sp, #32
51+
; CHECK-NEXT: vaddva.u16 r0, q0
7052
; CHECK-NEXT: bx lr
7153
entry:
7254
%s0s = zext <16 x i8> %s0 to <16 x i16>
@@ -190,17 +172,15 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
190172
; CHECK-NEXT: .LBB4_8: @ %vector.body
191173
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
192174
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
193-
; CHECK-NEXT: vldrh.s32 q2, [r5], #16
194-
; CHECK-NEXT: vldrh.s32 q1, [r4], #16
175+
; CHECK-NEXT: vldrh.u16 q1, [r4], #16
176+
; CHECK-NEXT: vldrh.u16 q2, [r5], #16
195177
; CHECK-NEXT: rsb.w r1, r12, #0
196-
; CHECK-NEXT: vmul.i32 q1, q2, q1
197-
; CHECK-NEXT: vldrh.s32 q2, [r4, #-8]
198-
; CHECK-NEXT: vldrh.s32 q3, [r5, #-8]
178+
; CHECK-NEXT: vmullb.s16 q3, q2, q1
179+
; CHECK-NEXT: vmullt.s16 q1, q2, q1
180+
; CHECK-NEXT: vshl.s32 q3, r1
199181
; CHECK-NEXT: vshl.s32 q1, r1
182+
; CHECK-NEXT: vaddva.u32 r6, q3
200183
; CHECK-NEXT: vaddva.u32 r6, q1
201-
; CHECK-NEXT: vmul.i32 q2, q3, q2
202-
; CHECK-NEXT: vshl.s32 q2, r1
203-
; CHECK-NEXT: vaddva.u32 r6, q2
204184
; CHECK-NEXT: le lr, .LBB4_8
205185
; CHECK-NEXT: @ %bb.9: @ %middle.block
206186
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1

0 commit comments

Comments
 (0)