Skip to content

Commit f609d75

Browse files
committed
[AArch64] Do not generate uitofp(ld4) where and/shift can be used.
After llvm#107201 and llvm#107367 the codegen for zext(ld4) can use and / shift to extract the lanes out of the original vector elements. This avoids the need for the expensive ld4 operations, so it can lead to performance improvements. This patch stops the generation of ld4 for uitofp(ld4) that would become uitofp(zext(ld4)). It doesn't handle zext yet to make sure that widening instructions like mull and addl are not adversely affected.
1 parent 3b426a8 commit f609d75

File tree

2 files changed

+58
-46
lines changed

2 files changed

+58
-46
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17079,6 +17079,16 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
1707917079
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
1708017080
return false;
1708117081

17082+
// Check if the interleave is a zext(shuffle), that can be better optimized
17083+
// into shift / and masks. For the moment we do this just for uitofp (not
17084+
// zext) to avoid issues with widening instructions.
17085+
if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17086+
return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17087+
SI->getType()->getScalarSizeInBits() * 4 ==
17088+
SI->user_back()->getType()->getScalarSizeInBits();
17089+
}))
17090+
return false;
17091+
1708217092
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
1708317093

1708417094
auto *FVTy = cast<FixedVectorType>(VTy);

llvm/test/CodeGen/AArch64/zext-shuffle.ll

Lines changed: 48 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -472,59 +472,61 @@ define <8 x double> @uitofp_fadd(<32 x i16> %l) {
472472
define <8 x double> @uitofp_load_fadd(ptr %p) {
473473
; CHECK-LABEL: uitofp_load_fadd:
474474
; CHECK: // %bb.0:
475-
; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
476-
; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0
477-
; CHECK-NEXT: ushll v5.4s, v0.4h, #0
478-
; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0
479-
; CHECK-NEXT: ushll v7.4s, v1.4h, #0
480-
; CHECK-NEXT: ushll2 v16.4s, v2.8h, #0
481-
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
482-
; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
483-
; CHECK-NEXT: ushll v0.4s, v3.4h, #0
484-
; CHECK-NEXT: ushll2 v1.2d, v4.4s, #0
485-
; CHECK-NEXT: ushll2 v2.2d, v5.4s, #0
486-
; CHECK-NEXT: ushll v3.2d, v4.2s, #0
487-
; CHECK-NEXT: ushll v4.2d, v5.2s, #0
488-
; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0
489-
; CHECK-NEXT: ushll2 v19.2d, v7.4s, #0
490-
; CHECK-NEXT: ushll v6.2d, v6.2s, #0
491-
; CHECK-NEXT: ushll v7.2d, v7.2s, #0
492-
; CHECK-NEXT: ushll2 v20.2d, v16.4s, #0
493-
; CHECK-NEXT: ushll2 v21.2d, v17.4s, #0
494-
; CHECK-NEXT: ushll v16.2d, v16.2s, #0
495-
; CHECK-NEXT: ushll v17.2d, v17.2s, #0
496-
; CHECK-NEXT: ushll v22.2d, v0.2s, #0
497-
; CHECK-NEXT: ushll2 v23.2d, v18.4s, #0
498-
; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
499-
; CHECK-NEXT: ushll v18.2d, v18.2s, #0
500-
; CHECK-NEXT: ucvtf v1.2d, v1.2d
501-
; CHECK-NEXT: ucvtf v2.2d, v2.2d
502-
; CHECK-NEXT: ucvtf v3.2d, v3.2d
503-
; CHECK-NEXT: ucvtf v4.2d, v4.2d
504-
; CHECK-NEXT: ucvtf v5.2d, v5.2d
475+
; CHECK-NEXT: ldp q1, q2, [x0]
476+
; CHECK-NEXT: movi v0.2d, #0x0000000000ffff
477+
; CHECK-NEXT: ldp q3, q4, [x0, #32]
478+
; CHECK-NEXT: ushr v5.2d, v1.2d, #16
479+
; CHECK-NEXT: ushr v6.2d, v2.2d, #16
480+
; CHECK-NEXT: ushr v20.2d, v1.2d, #32
481+
; CHECK-NEXT: ushr v7.2d, v3.2d, #16
482+
; CHECK-NEXT: ushr v17.2d, v4.2d, #16
483+
; CHECK-NEXT: ushr v22.2d, v2.2d, #32
484+
; CHECK-NEXT: ushr v23.2d, v3.2d, #32
485+
; CHECK-NEXT: ushr v24.2d, v4.2d, #32
486+
; CHECK-NEXT: and v16.16b, v1.16b, v0.16b
487+
; CHECK-NEXT: and v18.16b, v2.16b, v0.16b
488+
; CHECK-NEXT: and v19.16b, v3.16b, v0.16b
489+
; CHECK-NEXT: and v21.16b, v4.16b, v0.16b
490+
; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
491+
; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
492+
; CHECK-NEXT: and v7.16b, v7.16b, v0.16b
493+
; CHECK-NEXT: and v17.16b, v17.16b, v0.16b
494+
; CHECK-NEXT: and v20.16b, v20.16b, v0.16b
495+
; CHECK-NEXT: and v22.16b, v22.16b, v0.16b
496+
; CHECK-NEXT: and v23.16b, v23.16b, v0.16b
497+
; CHECK-NEXT: and v0.16b, v24.16b, v0.16b
498+
; CHECK-NEXT: ushr v1.2d, v1.2d, #48
499+
; CHECK-NEXT: ushr v2.2d, v2.2d, #48
500+
; CHECK-NEXT: ushr v3.2d, v3.2d, #48
501+
; CHECK-NEXT: ushr v4.2d, v4.2d, #48
502+
; CHECK-NEXT: ucvtf v16.2d, v16.2d
503+
; CHECK-NEXT: ucvtf v18.2d, v18.2d
505504
; CHECK-NEXT: ucvtf v19.2d, v19.2d
505+
; CHECK-NEXT: ucvtf v21.2d, v21.2d
506+
; CHECK-NEXT: ucvtf v5.2d, v5.2d
506507
; CHECK-NEXT: ucvtf v6.2d, v6.2d
507508
; CHECK-NEXT: ucvtf v7.2d, v7.2d
508-
; CHECK-NEXT: ucvtf v20.2d, v20.2d
509-
; CHECK-NEXT: ucvtf v21.2d, v21.2d
510-
; CHECK-NEXT: ucvtf v16.2d, v16.2d
511509
; CHECK-NEXT: ucvtf v17.2d, v17.2d
510+
; CHECK-NEXT: ucvtf v20.2d, v20.2d
512511
; CHECK-NEXT: ucvtf v22.2d, v22.2d
513512
; CHECK-NEXT: ucvtf v23.2d, v23.2d
514513
; CHECK-NEXT: ucvtf v0.2d, v0.2d
515-
; CHECK-NEXT: ucvtf v18.2d, v18.2d
516-
; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d
517-
; CHECK-NEXT: fadd v4.2d, v4.2d, v7.2d
518-
; CHECK-NEXT: fadd v6.2d, v3.2d, v6.2d
519-
; CHECK-NEXT: fadd v2.2d, v2.2d, v19.2d
520-
; CHECK-NEXT: fadd v3.2d, v17.2d, v22.2d
521-
; CHECK-NEXT: fadd v5.2d, v16.2d, v18.2d
522-
; CHECK-NEXT: fadd v7.2d, v21.2d, v0.2d
523-
; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d
524-
; CHECK-NEXT: fadd v0.2d, v4.2d, v3.2d
525-
; CHECK-NEXT: fadd v3.2d, v1.2d, v16.2d
526-
; CHECK-NEXT: fadd v1.2d, v2.2d, v7.2d
527-
; CHECK-NEXT: fadd v2.2d, v6.2d, v5.2d
514+
; CHECK-NEXT: ucvtf v1.2d, v1.2d
515+
; CHECK-NEXT: ucvtf v2.2d, v2.2d
516+
; CHECK-NEXT: ucvtf v3.2d, v3.2d
517+
; CHECK-NEXT: ucvtf v4.2d, v4.2d
518+
; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d
519+
; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d
520+
; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d
521+
; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d
522+
; CHECK-NEXT: fadd v1.2d, v20.2d, v1.2d
523+
; CHECK-NEXT: fadd v3.2d, v23.2d, v3.2d
524+
; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d
525+
; CHECK-NEXT: fadd v4.2d, v0.2d, v4.2d
526+
; CHECK-NEXT: fadd v0.2d, v5.2d, v1.2d
527+
; CHECK-NEXT: fadd v1.2d, v6.2d, v2.2d
528+
; CHECK-NEXT: fadd v2.2d, v7.2d, v3.2d
529+
; CHECK-NEXT: fadd v3.2d, v17.2d, v4.2d
528530
; CHECK-NEXT: ret
529531
%l = load <32 x i16>, ptr %p
530532
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>

0 commit comments

Comments
 (0)