Skip to content

Commit 662fa8e

Browse files
committed
[CGP,AArch64] Replace zexts with shuffle that can be lowered using tbl.
This patch extends CodeGenPrepare to lower zext v16i8 -> v16i32 in loops using a wide shuffle creating a v64i8 vector, selecting groups of 3 zero elements and an element from the input. This is profitable on AArch64 where such shuffles can be lowered to tbl instructions, but only in loops, because it requires materializing 4 masks, which can be done in the loop preheader. This is the only reason the transform is part of CGP. If there's a better alternative I missed, please let me know. The same goes for the shouldReplaceZExtWithShuffle hook which guards this. I am not sure if this transform will be beneficial on other targets, but it seems like there is no other convenient way. This improves the generated code for loops like the one below in combination with D96522. int foo(uint8_t *p, int N) { unsigned long long sum = 0; for (int i = 0; i < N ; i++, p++) { unsigned int v = *p; sum += (v < 127) ? v : 256 - v; } return sum; } https://clang.godbolt.org/z/Wco866MjY Reviewed By: t.p.northover Differential Revision: https://reviews.llvm.org/D120571 (cherry-picked from 81a11da)
1 parent 6cf0c21 commit 662fa8e

File tree

7 files changed

+419
-79
lines changed

7 files changed

+419
-79
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class Constant;
6969
class FastISel;
7070
class FunctionLoweringInfo;
7171
class GlobalValue;
72+
class Loop;
7273
class GISelKnownBits;
7374
class IntrinsicInst;
7475
class IRBuilderBase;
@@ -2713,6 +2714,13 @@ class TargetLoweringBase {
27132714
return false;
27142715
}
27152716

2717+
/// Try to optimize extending or truncating conversion instructions (like
2718+
/// zext, trunc, fptoui, uitofp) for the target.
2719+
virtual bool optimizeExtendOrTruncateConversion(Instruction *I,
2720+
Loop *L) const {
2721+
return false;
2722+
}
2723+
27162724
/// Return true if the target supplies and combines to a paired load
27172725
/// two loaded values of type LoadedType next to each other in memory.
27182726
/// RequiredAlignment gives the minimal alignment constraints that must be met

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7808,6 +7808,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
78087808
TargetLowering::TypeExpandInteger) {
78097809
return SinkCast(CI);
78107810
} else {
7811+
if (TLI->optimizeExtendOrTruncateConversion(
7812+
I, LI->getLoopFor(I->getParent())))
7813+
return true;
7814+
78117815
bool MadeChange = optimizeExt(I);
78127816
return MadeChange | optimizeExtUses(I);
78137817
}

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "llvm/ADT/StringRef.h"
3030
#include "llvm/ADT/Triple.h"
3131
#include "llvm/ADT/Twine.h"
32+
#include "llvm/Analysis/LoopInfo.h"
3233
#include "llvm/Analysis/MemoryLocation.h"
3334
#include "llvm/Analysis/ObjCARCUtil.h"
3435
#include "llvm/Analysis/VectorUtils.h"
@@ -12855,6 +12856,60 @@ bool AArch64TargetLowering::shouldSinkOperands(
1285512856
return false;
1285612857
}
1285712858

12859+
static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
12860+
Value *Op = ZExt->getOperand(0);
12861+
auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType());
12862+
auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType());
12863+
unsigned NumElts = SrcTy->getNumElements();
12864+
IRBuilder<> Builder(ZExt);
12865+
SmallVector<int> Mask(4 * NumElts, NumElts);
12866+
// Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to
12867+
// replace the original ZExt. This can later be lowered to a set of tbl
12868+
// instructions.
12869+
for (unsigned i = 0; i < NumElts; i++) {
12870+
if (IsLittleEndian)
12871+
Mask[i * 4] = i;
12872+
else
12873+
Mask[i * 4 + 3] = i;
12874+
}
12875+
12876+
auto *FirstEltZero = Builder.CreateInsertElement(
12877+
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
12878+
Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
12879+
Result = Builder.CreateBitCast(Result, DstTy);
12880+
ZExt->replaceAllUsesWith(Result);
12881+
ZExt->eraseFromParent();
12882+
}
12883+
12884+
bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
12885+
Loop *L) const {
12886+
// Try to optimize conversions using tbl. This requires materializing constant
12887+
// index vectors, which can increase code size and add loads. Skip the
12888+
// transform unless the conversion is in a loop block guaranteed to execute
12889+
// and we are not optimizing for size.
12890+
Function *F = I->getParent()->getParent();
12891+
if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
12892+
F->hasOptSize())
12893+
return false;
12894+
12895+
auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
12896+
auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
12897+
if (!SrcTy || !DstTy)
12898+
return false;
12899+
12900+
// Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be
12901+
// lowered to either 2 or 4 tbl instructions to insert the original i8
12902+
// elements into i32 lanes.
12903+
auto *ZExt = dyn_cast<ZExtInst>(I);
12904+
if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
12905+
SrcTy->getElementType()->isIntegerTy(8) &&
12906+
DstTy->getElementType()->isIntegerTy(32)) {
12907+
createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
12908+
return true;
12909+
}
12910+
return false;
12911+
}
12912+
1285812913
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
1285912914
Align &RequiredAligment) const {
1286012915
if (!LoadedType.isSimple() ||

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,9 @@ class AArch64TargetLowering : public TargetLowering {
600600
bool shouldSinkOperands(Instruction *I,
601601
SmallVectorImpl<Use *> &Ops) const override;
602602

603+
bool optimizeExtendOrTruncateConversion(Instruction *I,
604+
Loop *L) const override;
605+
603606
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
604607

605608
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

llvm/test/CodeGen/AArch64/vselect-ext.ll

Lines changed: 62 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -257,35 +257,53 @@ entry:
257257
define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) {
258258
; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32:
259259
; CHECK: ; %bb.0: ; %entry
260-
; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff
260+
; CHECK-NEXT: Lloh0:
261+
; CHECK-NEXT: adrp x9, lCPI8_0@PAGE
262+
; CHECK-NEXT: Lloh1:
263+
; CHECK-NEXT: adrp x10, lCPI8_1@PAGE
264+
; CHECK-NEXT: Lloh2:
265+
; CHECK-NEXT: adrp x11, lCPI8_2@PAGE
266+
; CHECK-NEXT: Lloh3:
267+
; CHECK-NEXT: adrp x12, lCPI8_3@PAGE
268+
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
261269
; CHECK-NEXT: mov x8, xzr
270+
; CHECK-NEXT: Lloh4:
271+
; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF]
272+
; CHECK-NEXT: Lloh5:
273+
; CHECK-NEXT: ldr q1, [x10, lCPI8_1@PAGEOFF]
274+
; CHECK-NEXT: Lloh6:
275+
; CHECK-NEXT: ldr q3, [x11, lCPI8_2@PAGEOFF]
276+
; CHECK-NEXT: Lloh7:
277+
; CHECK-NEXT: ldr q4, [x12, lCPI8_3@PAGEOFF]
262278
; CHECK-NEXT: LBB8_1: ; %loop
263279
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
264-
; CHECK-NEXT: ldr q1, [x0, x8]
280+
; CHECK-NEXT: ldr q5, [x0, x8]
265281
; CHECK-NEXT: add x8, x8, #16
266282
; CHECK-NEXT: cmp x8, #128
267-
; CHECK-NEXT: cmgt.16b v2, v1, v0
268-
; CHECK-NEXT: ushll2.8h v3, v1, #0
269-
; CHECK-NEXT: sshll2.8h v4, v2, #0
270-
; CHECK-NEXT: ushll2.4s v5, v3, #0
271-
; CHECK-NEXT: ushll.4s v3, v3, #0
272-
; CHECK-NEXT: sshll2.4s v6, v4, #0
273-
; CHECK-NEXT: sshll.4s v4, v4, #0
274-
; CHECK-NEXT: ushll.8h v1, v1, #0
275-
; CHECK-NEXT: sshll.8h v2, v2, #0
283+
; CHECK-NEXT: cmgt.16b v6, v5, v2
284+
; CHECK-NEXT: tbl.16b v7, { v5 }, v0
285+
; CHECK-NEXT: tbl.16b v16, { v5 }, v1
286+
; CHECK-NEXT: sshll2.8h v18, v6, #0
287+
; CHECK-NEXT: tbl.16b v17, { v5 }, v3
288+
; CHECK-NEXT: sshll2.4s v19, v18, #0
289+
; CHECK-NEXT: sshll.4s v18, v18, #0
290+
; CHECK-NEXT: tbl.16b v5, { v5 }, v4
291+
; CHECK-NEXT: sshll.8h v6, v6, #0
292+
; CHECK-NEXT: and.16b v7, v7, v19
293+
; CHECK-NEXT: and.16b v16, v16, v18
294+
; CHECK-NEXT: stp q16, q7, [x1, #32]
295+
; CHECK-NEXT: sshll2.4s v7, v6, #0
296+
; CHECK-NEXT: sshll.4s v6, v6, #0
297+
; CHECK-NEXT: and.16b v7, v17, v7
276298
; CHECK-NEXT: and.16b v5, v5, v6
277-
; CHECK-NEXT: and.16b v3, v3, v4
278-
; CHECK-NEXT: stp q3, q5, [x1, #32]
279-
; CHECK-NEXT: sshll2.4s v4, v2, #0
280-
; CHECK-NEXT: sshll.4s v2, v2, #0
281-
; CHECK-NEXT: ushll2.4s v3, v1, #0
282-
; CHECK-NEXT: ushll.4s v1, v1, #0
283-
; CHECK-NEXT: and.16b v3, v3, v4
284-
; CHECK-NEXT: and.16b v1, v1, v2
285-
; CHECK-NEXT: stp q1, q3, [x1], #64
299+
; CHECK-NEXT: stp q5, q7, [x1], #64
286300
; CHECK-NEXT: b.ne LBB8_1
287301
; CHECK-NEXT: ; %bb.2: ; %exit
288302
; CHECK-NEXT: ret
303+
; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
304+
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6
305+
; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5
306+
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4
289307
entry:
290308
br label %loop
291309

@@ -311,23 +329,23 @@ exit:
311329
define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
312330
; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
313331
; CHECK: ; %bb.0: ; %entry
314-
; CHECK-NEXT: Lloh0:
332+
; CHECK-NEXT: Lloh8:
315333
; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
316-
; CHECK-NEXT: Lloh1:
334+
; CHECK-NEXT: Lloh9:
317335
; CHECK-NEXT: adrp x10, lCPI9_1@PAGE
318-
; CHECK-NEXT: Lloh2:
336+
; CHECK-NEXT: Lloh10:
319337
; CHECK-NEXT: adrp x11, lCPI9_2@PAGE
320-
; CHECK-NEXT: Lloh3:
338+
; CHECK-NEXT: Lloh11:
321339
; CHECK-NEXT: adrp x12, lCPI9_3@PAGE
322340
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
323341
; CHECK-NEXT: mov x8, xzr
324-
; CHECK-NEXT: Lloh4:
342+
; CHECK-NEXT: Lloh12:
325343
; CHECK-NEXT: ldr q0, [x9, lCPI9_0@PAGEOFF]
326-
; CHECK-NEXT: Lloh5:
344+
; CHECK-NEXT: Lloh13:
327345
; CHECK-NEXT: ldr q1, [x10, lCPI9_1@PAGEOFF]
328-
; CHECK-NEXT: Lloh6:
346+
; CHECK-NEXT: Lloh14:
329347
; CHECK-NEXT: ldr q3, [x11, lCPI9_2@PAGEOFF]
330-
; CHECK-NEXT: Lloh7:
348+
; CHECK-NEXT: Lloh15:
331349
; CHECK-NEXT: ldr q4, [x12, lCPI9_3@PAGEOFF]
332350
; CHECK-NEXT: LBB9_1: ; %loop
333351
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -354,10 +372,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
354372
; CHECK-NEXT: b.ne LBB9_1
355373
; CHECK-NEXT: ; %bb.2: ; %exit
356374
; CHECK-NEXT: ret
357-
; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
358-
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6
359-
; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5
360-
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4
375+
; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
376+
; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14
377+
; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh13
378+
; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh12
361379
entry:
362380
br label %loop
363381

@@ -384,23 +402,23 @@ exit:
384402
define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
385403
; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
386404
; CHECK: ; %bb.0: ; %entry
387-
; CHECK-NEXT: Lloh8:
405+
; CHECK-NEXT: Lloh16:
388406
; CHECK-NEXT: adrp x9, lCPI10_0@PAGE
389-
; CHECK-NEXT: Lloh9:
407+
; CHECK-NEXT: Lloh17:
390408
; CHECK-NEXT: adrp x10, lCPI10_1@PAGE
391-
; CHECK-NEXT: Lloh10:
409+
; CHECK-NEXT: Lloh18:
392410
; CHECK-NEXT: adrp x11, lCPI10_2@PAGE
393-
; CHECK-NEXT: Lloh11:
411+
; CHECK-NEXT: Lloh19:
394412
; CHECK-NEXT: adrp x12, lCPI10_3@PAGE
395413
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
396414
; CHECK-NEXT: mov x8, xzr
397-
; CHECK-NEXT: Lloh12:
415+
; CHECK-NEXT: Lloh20:
398416
; CHECK-NEXT: ldr q0, [x9, lCPI10_0@PAGEOFF]
399-
; CHECK-NEXT: Lloh13:
417+
; CHECK-NEXT: Lloh21:
400418
; CHECK-NEXT: ldr q1, [x10, lCPI10_1@PAGEOFF]
401-
; CHECK-NEXT: Lloh14:
419+
; CHECK-NEXT: Lloh22:
402420
; CHECK-NEXT: ldr q3, [x11, lCPI10_2@PAGEOFF]
403-
; CHECK-NEXT: Lloh15:
421+
; CHECK-NEXT: Lloh23:
404422
; CHECK-NEXT: ldr q4, [x12, lCPI10_3@PAGEOFF]
405423
; CHECK-NEXT: LBB10_1: ; %loop
406424
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -427,10 +445,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
427445
; CHECK-NEXT: b.ne LBB10_1
428446
; CHECK-NEXT: ; %bb.2: ; %exit
429447
; CHECK-NEXT: ret
430-
; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
431-
; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14
432-
; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh13
433-
; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh12
448+
; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
449+
; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22
450+
; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21
451+
; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh20
434452
entry:
435453
br label %loop
436454

0 commit comments

Comments
 (0)